feat: Language detection improvements
This commit is contained in:
parent
982eb02c45
commit
d0e93bb2ab
|
@ -1,2 +1,3 @@
|
||||||
lib
|
lib
|
||||||
jest.config.js
|
jest.config.js
|
||||||
|
testing.js
|
||||||
|
|
|
@ -3,7 +3,7 @@ import { Match } from '../match';
|
||||||
export interface Recogniser {
|
export interface Recogniser {
|
||||||
match(input: Context): Match | null;
|
match(input: Context): Match | null;
|
||||||
name(input?: Context): string;
|
name(input?: Context): string;
|
||||||
language?(): string;
|
language?(): string | undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface Context {
|
export interface Context {
|
||||||
|
|
|
@ -1,17 +1,34 @@
|
||||||
import * as chardet from '..';
|
import * as chardet from '..';
|
||||||
|
import fs from 'fs';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
describe('ISO-2022', () => {
|
describe('ISO-2022', () => {
|
||||||
const base = __dirname + '/../test/data/encodings';
|
const base = __dirname + '/../test/data/encodings';
|
||||||
|
|
||||||
|
const analyse = (asset: string) =>
|
||||||
|
chardet.analyse(fs.readFileSync(path.join(base, asset))).shift();
|
||||||
|
|
||||||
it('should return ISO-2022-JP', () => {
|
it('should return ISO-2022-JP', () => {
|
||||||
expect(chardet.detectFileSync(base + '/iso2022jp')).toBe('ISO-2022-JP');
|
expect(analyse('iso2022jp')).toEqual({
|
||||||
|
confidence: 100,
|
||||||
|
lang: 'ja',
|
||||||
|
name: 'ISO-2022-JP',
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should return ISO-2022-KR', () => {
|
it('should return ISO-2022-KR', () => {
|
||||||
expect(chardet.detectFileSync(base + '/iso2022kr')).toBe('ISO-2022-KR');
|
expect(analyse('iso2022kr')).toEqual({
|
||||||
|
confidence: 100,
|
||||||
|
lang: 'kr',
|
||||||
|
name: 'ISO-2022-KR',
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should return ISO-2022-CN', () => {
|
it('should return ISO-2022-CN', () => {
|
||||||
expect(chardet.detectFileSync(base + '/iso2022cn')).toBe('ISO-2022-CN');
|
expect(analyse('iso2022cn')).toEqual({
|
||||||
|
confidence: 100,
|
||||||
|
lang: 'zh',
|
||||||
|
name: 'ISO-2022-CN',
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
|
@ -33,7 +33,7 @@ class ISO_2022 implements Recogniser {
|
||||||
let hits = 0;
|
let hits = 0;
|
||||||
let misses = 0;
|
let misses = 0;
|
||||||
let shifts = 0;
|
let shifts = 0;
|
||||||
let quality;
|
let confidence;
|
||||||
|
|
||||||
// TODO: refactor me
|
// TODO: refactor me
|
||||||
const text = det.inputBytes;
|
const text = det.inputBytes;
|
||||||
|
@ -73,14 +73,14 @@ class ISO_2022 implements Recogniser {
|
||||||
// All good: quality = 100;
|
// All good: quality = 100;
|
||||||
// half or less good: quality = 0;
|
// half or less good: quality = 0;
|
||||||
// linear in between.
|
// linear in between.
|
||||||
quality = (100 * hits - 100 * misses) / (hits + misses);
|
confidence = (100 * hits - 100 * misses) / (hits + misses);
|
||||||
|
|
||||||
// Back off quality if there were too few escape sequences seen.
|
// Back off quality if there were too few escape sequences seen.
|
||||||
// Include shifts in this computation, so that KR does not get penalized
|
// Include shifts in this computation, so that KR does not get penalized
|
||||||
// for having only a single Escape sequence, but many shifts.
|
// for having only a single Escape sequence, but many shifts.
|
||||||
if (hits + shifts < 5) quality -= (5 - (hits + shifts)) * 10;
|
if (hits + shifts < 5) confidence -= (5 - (hits + shifts)) * 10;
|
||||||
|
|
||||||
return quality <= 0 ? null : match(det, this, quality);
|
return confidence <= 0 ? null : match(det, this, confidence);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -88,6 +88,11 @@ export class ISO_2022_JP extends ISO_2022 {
|
||||||
name() {
|
name() {
|
||||||
return 'ISO-2022-JP';
|
return 'ISO-2022-JP';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
language() {
|
||||||
|
return 'ja';
|
||||||
|
}
|
||||||
|
|
||||||
escapeSequences = [
|
escapeSequences = [
|
||||||
[0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
|
[0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
|
||||||
[0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990
|
[0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990
|
||||||
|
@ -108,6 +113,9 @@ export class ISO_2022_KR extends ISO_2022 {
|
||||||
name() {
|
name() {
|
||||||
return 'ISO-2022-KR';
|
return 'ISO-2022-KR';
|
||||||
}
|
}
|
||||||
|
language() {
|
||||||
|
return 'kr';
|
||||||
|
}
|
||||||
escapeSequences = [[0x1b, 0x24, 0x29, 0x43]];
|
escapeSequences = [[0x1b, 0x24, 0x29, 0x43]];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,6 +123,9 @@ export class ISO_2022_CN extends ISO_2022 {
|
||||||
name() {
|
name() {
|
||||||
return 'ISO-2022-CN';
|
return 'ISO-2022-CN';
|
||||||
}
|
}
|
||||||
|
language() {
|
||||||
|
return 'zh';
|
||||||
|
}
|
||||||
escapeSequences = [
|
escapeSequences = [
|
||||||
[0x1b, 0x24, 0x29, 0x41], // GB 2312-80
|
[0x1b, 0x24, 0x29, 0x41], // GB 2312-80
|
||||||
[0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
|
[0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
|
||||||
|
|
|
@ -110,6 +110,8 @@ const isFlatNgrams = (val: NGramsPlusLang[] | number[]): val is number[] =>
|
||||||
class sbcs implements Recogniser {
|
class sbcs implements Recogniser {
|
||||||
spaceChar = 0x20;
|
spaceChar = 0x20;
|
||||||
|
|
||||||
|
private nGramLang?: string = undefined;
|
||||||
|
|
||||||
ngrams(): NGramsPlusLang[] | number[] {
|
ngrams(): NGramsPlusLang[] | number[] {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
@ -122,7 +124,16 @@ class sbcs implements Recogniser {
|
||||||
return 'sbcs';
|
return 'sbcs';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
language(): string | undefined {
|
||||||
|
return this.nGramLang;
|
||||||
|
}
|
||||||
|
|
||||||
match(det: Context): Match | null {
|
match(det: Context): Match | null {
|
||||||
|
// This feels a bit dirty. Simpler alternative would be
|
||||||
|
// splitting classes ISO_8859_1 etc into language-specific ones
|
||||||
|
// with hardcoded languages like ISO_8859_9.
|
||||||
|
this.nGramLang = undefined;
|
||||||
|
|
||||||
const ngrams = this.ngrams();
|
const ngrams = this.ngrams();
|
||||||
|
|
||||||
if (isFlatNgrams(ngrams)) {
|
if (isFlatNgrams(ngrams)) {
|
||||||
|
@ -131,24 +142,20 @@ class sbcs implements Recogniser {
|
||||||
return confidence <= 0 ? null : match(det, this, confidence);
|
return confidence <= 0 ? null : match(det, this, confidence);
|
||||||
}
|
}
|
||||||
|
|
||||||
let bestConfidenceSoFar = -1;
|
let bestConfidence = -1;
|
||||||
let lang;
|
|
||||||
|
|
||||||
for (let i = ngrams.length - 1; i >= 0; i--) {
|
for (let i = ngrams.length - 1; i >= 0; i--) {
|
||||||
const ngl = ngrams[i];
|
const ngl = ngrams[i];
|
||||||
|
|
||||||
const parser = new NGramParser(ngl.fNGrams, this.byteMap());
|
const parser = new NGramParser(ngl.fNGrams, this.byteMap());
|
||||||
const confidence = parser.parse(det, this.spaceChar);
|
const confidence = parser.parse(det, this.spaceChar);
|
||||||
if (confidence > bestConfidenceSoFar) {
|
if (confidence > bestConfidence) {
|
||||||
bestConfidenceSoFar = confidence;
|
bestConfidence = confidence;
|
||||||
lang = ngl.fLang;
|
this.nGramLang = ngl.fLang;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const name = this.name(det);
|
return bestConfidence <= 0 ? null : match(det, this, bestConfidence);
|
||||||
return bestConfidenceSoFar <= 0
|
|
||||||
? null
|
|
||||||
: match(det, this, bestConfidenceSoFar, name, lang);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,13 +8,13 @@ describe('chardet', () => {
|
||||||
const expectedEncodingsFromPath = [
|
const expectedEncodingsFromPath = [
|
||||||
{ 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
|
{ 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
|
||||||
{ 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
|
{ 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
|
||||||
{ 'confidence': 19, 'name': 'KOI8-R', 'lang': undefined },
|
{ 'confidence': 19, 'name': 'KOI8-R', 'lang': 'ru' },
|
||||||
{ 'confidence': 10, 'name': 'Big5', 'lang': undefined },
|
{ 'confidence': 10, 'name': 'Big5', 'lang': 'zh' },
|
||||||
{ 'confidence': 10, 'name': 'GB18030', 'lang': undefined },
|
{ 'confidence': 10, 'name': 'GB18030', 'lang': 'zh' }, // Mandarin
|
||||||
{ 'confidence': 10, 'name': 'windows-1253', 'lang': undefined },
|
{ 'confidence': 10, 'name': 'windows-1253', 'lang': 'el' }, // Greek
|
||||||
{ 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' },
|
{ 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' },
|
||||||
{ 'confidence': 4, 'name': 'windows-1254', 'lang': undefined },
|
{ 'confidence': 4, 'name': 'windows-1254', 'lang': 'tr' },
|
||||||
{ 'confidence': 2, 'name': 'windows-1251', 'lang': undefined },
|
{ 'confidence': 2, 'name': 'windows-1251', 'lang': 'ru' },
|
||||||
];
|
];
|
||||||
|
|
||||||
it('has both named and default exports', () => {
|
it('has both named and default exports', () => {
|
||||||
|
|
|
@ -6,8 +6,8 @@ export interface Match {
|
||||||
lang?: string;
|
lang?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export default (det: Context, rec: Recogniser, confidence: number, name?: string, lang?: string): Match => ({
|
export default (ctx: Context, rec: Recogniser, confidence: number): Match => ({
|
||||||
confidence,
|
confidence,
|
||||||
name: name || rec.name(det),
|
name: rec.name(ctx),
|
||||||
lang,
|
lang: rec.language ? rec.language() : undefined,
|
||||||
});
|
});
|
||||||
|
|
Loading…
Reference in New Issue