feat: Language detection improvements

This commit is contained in:
Dmitry Shirokov 2021-10-19 21:21:29 +11:00
parent 982eb02c45
commit d0e93bb2ab
No known key found for this signature in database
GPG Key ID: B5BDB86B2BCFF8EC
7 changed files with 62 additions and 26 deletions

View File

@ -1,2 +1,3 @@
lib lib
jest.config.js jest.config.js
testing.js

View File

@ -3,7 +3,7 @@ import { Match } from '../match';
export interface Recogniser { export interface Recogniser {
match(input: Context): Match | null; match(input: Context): Match | null;
name(input?: Context): string; name(input?: Context): string;
language?(): string; language?(): string | undefined;
} }
export interface Context { export interface Context {

View File

@ -1,17 +1,34 @@
import * as chardet from '..'; import * as chardet from '..';
import fs from 'fs';
import path from 'path';
describe('ISO-2022', () => { describe('ISO-2022', () => {
const base = __dirname + '/../test/data/encodings'; const base = __dirname + '/../test/data/encodings';
// Test helper: reads the fixture `asset` from the `base` directory synchronously,
// runs chardet.analyse() on its bytes and returns the first entry of the
// resulting list (undefined when the list is empty).
// NOTE(review): presumably the first entry is the highest-confidence match —
// confirm against chardet.analyse() ordering.
const analyse = (asset: string) =>
chardet.analyse(fs.readFileSync(path.join(base, asset))).shift();
it('should return ISO-2022-JP', () => { it('should return ISO-2022-JP', () => {
expect(chardet.detectFileSync(base + '/iso2022jp')).toBe('ISO-2022-JP'); expect(analyse('iso2022jp')).toEqual({
confidence: 100,
lang: 'ja',
name: 'ISO-2022-JP',
});
}); });
it('should return ISO-2022-KR', () => { it('should return ISO-2022-KR', () => {
expect(chardet.detectFileSync(base + '/iso2022kr')).toBe('ISO-2022-KR'); expect(analyse('iso2022kr')).toEqual({
confidence: 100,
lang: 'kr',
name: 'ISO-2022-KR',
});
}); });
it('should return ISO-2022-CN', () => { it('should return ISO-2022-CN', () => {
expect(chardet.detectFileSync(base + '/iso2022cn')).toBe('ISO-2022-CN'); expect(analyse('iso2022cn')).toEqual({
confidence: 100,
lang: 'zh',
name: 'ISO-2022-CN',
});
}); });
}); });

View File

@ -33,7 +33,7 @@ class ISO_2022 implements Recogniser {
let hits = 0; let hits = 0;
let misses = 0; let misses = 0;
let shifts = 0; let shifts = 0;
let quality; let confidence;
// TODO: refactor me // TODO: refactor me
const text = det.inputBytes; const text = det.inputBytes;
@ -73,14 +73,14 @@ class ISO_2022 implements Recogniser {
// All good: quality = 100; // All good: quality = 100;
// half or less good: quality = 0; // half or less good: quality = 0;
// linear in between. // linear in between.
quality = (100 * hits - 100 * misses) / (hits + misses); confidence = (100 * hits - 100 * misses) / (hits + misses);
// Back off quality if there were too few escape sequences seen. // Back off quality if there were too few escape sequences seen.
// Include shifts in this computation, so that KR does not get penalized // Include shifts in this computation, so that KR does not get penalized
// for having only a single Escape sequence, but many shifts. // for having only a single Escape sequence, but many shifts.
if (hits + shifts < 5) quality -= (5 - (hits + shifts)) * 10; if (hits + shifts < 5) confidence -= (5 - (hits + shifts)) * 10;
return quality <= 0 ? null : match(det, this, quality); return confidence <= 0 ? null : match(det, this, confidence);
} }
} }
@ -88,6 +88,11 @@ export class ISO_2022_JP extends ISO_2022 {
name() { name() {
return 'ISO-2022-JP'; return 'ISO-2022-JP';
} }
// Language reported for matches produced by this recogniser; the match
// factory copies it into Match.lang. 'ja' is the ISO 639-1 code for Japanese.
language() {
return 'ja';
}
escapeSequences = [ escapeSequences = [
[0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992 [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
[0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990 [0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990
@ -108,6 +113,9 @@ export class ISO_2022_KR extends ISO_2022 {
name() { name() {
return 'ISO-2022-KR'; return 'ISO-2022-KR';
} }
// Language reported for matches produced by this recogniser; the match
// factory copies it into Match.lang.
// NOTE(review): 'kr' is the ISO 3166 country code for South Korea, while the
// other recognisers report ISO 639-1 language codes ('ja', 'zh', 'ru', ...).
// The ISO 639-1 code for Korean is 'ko' — confirm whether 'kr' is intended;
// changing it also requires updating the ISO-2022-KR test expectation.
language() {
return 'kr';
}
escapeSequences = [[0x1b, 0x24, 0x29, 0x43]]; escapeSequences = [[0x1b, 0x24, 0x29, 0x43]];
} }
@ -115,6 +123,9 @@ export class ISO_2022_CN extends ISO_2022 {
name() { name() {
return 'ISO-2022-CN'; return 'ISO-2022-CN';
} }
// Language reported for matches produced by this recogniser; the match
// factory copies it into Match.lang. 'zh' is the ISO 639-1 code for Chinese.
language() {
return 'zh';
}
escapeSequences = [ escapeSequences = [
[0x1b, 0x24, 0x29, 0x41], // GB 2312-80 [0x1b, 0x24, 0x29, 0x41], // GB 2312-80
[0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1 [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1

View File

@ -110,6 +110,8 @@ const isFlatNgrams = (val: NGramsPlusLang[] | number[]): val is number[] =>
class sbcs implements Recogniser { class sbcs implements Recogniser {
spaceChar = 0x20; spaceChar = 0x20;
private nGramLang?: string = undefined;
ngrams(): NGramsPlusLang[] | number[] { ngrams(): NGramsPlusLang[] | number[] {
return []; return [];
} }
@ -122,7 +124,16 @@ class sbcs implements Recogniser {
return 'sbcs'; return 'sbcs';
} }
// Returns the language recorded by the most recent match() call on this
// instance: match() resets nGramLang to undefined and, for per-language
// n-gram tables, stores the fLang of the best-scoring table. Stays undefined
// before any match() call and for recognisers with flat n-gram tables.
language(): string | undefined {
return this.nGramLang;
}
match(det: Context): Match | null { match(det: Context): Match | null {
// This feels a bit dirty. Simpler alternative would be
// splitting classes ISO_8859_1 etc into language-specific ones
// with hardcoded languages like ISO_8859_9.
this.nGramLang = undefined;
const ngrams = this.ngrams(); const ngrams = this.ngrams();
if (isFlatNgrams(ngrams)) { if (isFlatNgrams(ngrams)) {
@ -131,24 +142,20 @@ class sbcs implements Recogniser {
return confidence <= 0 ? null : match(det, this, confidence); return confidence <= 0 ? null : match(det, this, confidence);
} }
let bestConfidenceSoFar = -1; let bestConfidence = -1;
let lang;
for (let i = ngrams.length - 1; i >= 0; i--) { for (let i = ngrams.length - 1; i >= 0; i--) {
const ngl = ngrams[i]; const ngl = ngrams[i];
const parser = new NGramParser(ngl.fNGrams, this.byteMap()); const parser = new NGramParser(ngl.fNGrams, this.byteMap());
const confidence = parser.parse(det, this.spaceChar); const confidence = parser.parse(det, this.spaceChar);
if (confidence > bestConfidenceSoFar) { if (confidence > bestConfidence) {
bestConfidenceSoFar = confidence; bestConfidence = confidence;
lang = ngl.fLang; this.nGramLang = ngl.fLang;
} }
} }
const name = this.name(det); return bestConfidence <= 0 ? null : match(det, this, bestConfidence);
return bestConfidenceSoFar <= 0
? null
: match(det, this, bestConfidenceSoFar, name, lang);
} }
} }

View File

@ -8,13 +8,13 @@ describe('chardet', () => {
const expectedEncodingsFromPath = [ const expectedEncodingsFromPath = [
{ 'confidence': 100, 'name': 'UTF-8', 'lang': undefined }, { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
{ 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' }, { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
{ 'confidence': 19, 'name': 'KOI8-R', 'lang': undefined }, { 'confidence': 19, 'name': 'KOI8-R', 'lang': 'ru' },
{ 'confidence': 10, 'name': 'Big5', 'lang': undefined }, { 'confidence': 10, 'name': 'Big5', 'lang': 'zh' },
{ 'confidence': 10, 'name': 'GB18030', 'lang': undefined }, { 'confidence': 10, 'name': 'GB18030', 'lang': 'zh' }, // Mandarin
{ 'confidence': 10, 'name': 'windows-1253', 'lang': undefined }, { 'confidence': 10, 'name': 'windows-1253', 'lang': 'el' }, // Greek
{ 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' }, { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' },
{ 'confidence': 4, 'name': 'windows-1254', 'lang': undefined }, { 'confidence': 4, 'name': 'windows-1254', 'lang': 'tr' },
{ 'confidence': 2, 'name': 'windows-1251', 'lang': undefined }, { 'confidence': 2, 'name': 'windows-1251', 'lang': 'ru' },
]; ];
it('has both named and default exports', () => { it('has both named and default exports', () => {

View File

@ -6,8 +6,8 @@ export interface Match {
lang?: string; lang?: string;
} }
export default (det: Context, rec: Recogniser, confidence: number, name?: string, lang?: string): Match => ({ export default (ctx: Context, rec: Recogniser, confidence: number): Match => ({
confidence, confidence,
name: name || rec.name(det), name: rec.name(ctx),
lang, lang: rec.language ? rec.language() : undefined,
}); });