From d0e93bb2abea853145721e6c0b18ffcb8d2bbea8 Mon Sep 17 00:00:00 2001 From: Dmitry Shirokov Date: Tue, 19 Oct 2021 21:21:29 +1100 Subject: [PATCH] feat: Language detection improvements --- .eslintignore | 1 + src/encoding/index.ts | 2 +- src/encoding/iso2022.test.ts | 23 ++++++++++++++++++++--- src/encoding/iso2022.ts | 19 +++++++++++++++---- src/encoding/sbcs.ts | 25 ++++++++++++++++--------- src/index.test.ts | 12 ++++++------ src/match.ts | 6 +++--- 7 files changed, 62 insertions(+), 26 deletions(-) diff --git a/.eslintignore b/.eslintignore index d060117..8edbac4 100644 --- a/.eslintignore +++ b/.eslintignore @@ -1,2 +1,3 @@ lib jest.config.js +testing.js diff --git a/src/encoding/index.ts b/src/encoding/index.ts index 01686f1..cad5221 100644 --- a/src/encoding/index.ts +++ b/src/encoding/index.ts @@ -3,7 +3,7 @@ import { Match } from '../match'; export interface Recogniser { match(input: Context): Match | null; name(input?: Context): string; - language?(): string; + language?(): string | undefined; } export interface Context { diff --git a/src/encoding/iso2022.test.ts b/src/encoding/iso2022.test.ts index a074599..21606a0 100644 --- a/src/encoding/iso2022.test.ts +++ b/src/encoding/iso2022.test.ts @@ -1,17 +1,34 @@ import * as chardet from '..'; +import fs from 'fs'; +import path from 'path'; describe('ISO-2022', () => { const base = __dirname + '/../test/data/encodings'; + const analyse = (asset: string) => + chardet.analyse(fs.readFileSync(path.join(base, asset))).shift(); + it('should return ISO-2022-JP', () => { - expect(chardet.detectFileSync(base + '/iso2022jp')).toBe('ISO-2022-JP'); + expect(analyse('iso2022jp')).toEqual({ + confidence: 100, + lang: 'ja', + name: 'ISO-2022-JP', + }); }); it('should return ISO-2022-KR', () => { - expect(chardet.detectFileSync(base + '/iso2022kr')).toBe('ISO-2022-KR'); + expect(analyse('iso2022kr')).toEqual({ + confidence: 100, + lang: 'kr', + name: 'ISO-2022-KR', + }); }); it('should return ISO-2022-CN', () => { - expect(chardet.detectFileSync(base + '/iso2022cn')).toBe('ISO-2022-CN'); + expect(analyse('iso2022cn')).toEqual({ + confidence: 100, + lang: 'zh', + name: 'ISO-2022-CN', + }); }); }); diff --git a/src/encoding/iso2022.ts b/src/encoding/iso2022.ts index 46aa974..598e283 100644 --- a/src/encoding/iso2022.ts +++ b/src/encoding/iso2022.ts @@ -33,7 +33,7 @@ class ISO_2022 implements Recogniser { let hits = 0; let misses = 0; let shifts = 0; - let quality; + let confidence; // TODO: refactor me const text = det.inputBytes; @@ -73,14 +73,14 @@ class ISO_2022 implements Recogniser { // All good: quality = 100; // half or less good: quality = 0; // linear in between. - quality = (100 * hits - 100 * misses) / (hits + misses); + confidence = (100 * hits - 100 * misses) / (hits + misses); // Back off quality if there were too few escape sequences seen. // Include shifts in this computation, so that KR does not get penalized // for having only a single Escape sequence, but many shifts. - if (hits + shifts < 5) quality -= (5 - (hits + shifts)) * 10; + if (hits + shifts < 5) confidence -= (5 - (hits + shifts)) * 10; - return quality <= 0 ? null : match(det, this, quality); + return confidence <= 0 ? null : match(det, this, confidence); } } @@ -88,6 +88,11 @@ export class ISO_2022_JP extends ISO_2022 { name() { return 'ISO-2022-JP'; } + + language() { + return 'ja'; + } + escapeSequences = [ [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992 [0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990 @@ -108,6 +113,9 @@ export class ISO_2022_KR extends ISO_2022 { name() { return 'ISO-2022-KR'; } + language() { + return 'kr'; + } escapeSequences = [[0x1b, 0x24, 0x29, 0x43]]; } @@ -115,6 +123,9 @@ export class ISO_2022_CN extends ISO_2022 { name() { return 'ISO-2022-CN'; } + language() { + return 'zh'; + } escapeSequences = [ [0x1b, 0x24, 0x29, 0x41], // GB 2312-80 [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1 diff --git a/src/encoding/sbcs.ts b/src/encoding/sbcs.ts index 1484413..bf17036 100644 --- a/src/encoding/sbcs.ts +++ b/src/encoding/sbcs.ts @@ -110,6 +110,8 @@ const isFlatNgrams = (val: NGramsPlusLang[] | number[]): val is number[] => class sbcs implements Recogniser { spaceChar = 0x20; + private nGramLang?: string = undefined; + ngrams(): NGramsPlusLang[] | number[] { return []; } @@ -122,7 +124,16 @@ class sbcs implements Recogniser { return 'sbcs'; } + language(): string | undefined { + return this.nGramLang; + } + match(det: Context): Match | null { + // This feels a bit dirty. Simpler alternative would be + // splitting classes ISO_8859_1 etc into language-specific ones + // with hardcoded languages like ISO_8859_9. + this.nGramLang = undefined; + const ngrams = this.ngrams(); if (isFlatNgrams(ngrams)) { @@ -131,24 +142,20 @@ class sbcs implements Recogniser { return confidence <= 0 ? null : match(det, this, confidence); } - let bestConfidenceSoFar = -1; - let lang; + let bestConfidence = -1; for (let i = ngrams.length - 1; i >= 0; i--) { const ngl = ngrams[i]; const parser = new NGramParser(ngl.fNGrams, this.byteMap()); const confidence = parser.parse(det, this.spaceChar); - if (confidence > bestConfidenceSoFar) { - bestConfidenceSoFar = confidence; - lang = ngl.fLang; + if (confidence > bestConfidence) { + bestConfidence = confidence; + this.nGramLang = ngl.fLang; } } - const name = this.name(det); - return bestConfidenceSoFar <= 0 - ? null - : match(det, this, bestConfidenceSoFar, name, lang); + return bestConfidence <= 0 ? null : match(det, this, bestConfidence); } } diff --git a/src/index.test.ts b/src/index.test.ts index 6a1f403..d1eba76 100644 --- a/src/index.test.ts +++ b/src/index.test.ts @@ -8,13 +8,13 @@ describe('chardet', () => { const expectedEncodingsFromPath = [ { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined }, { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' }, - { 'confidence': 19, 'name': 'KOI8-R', 'lang': undefined }, - { 'confidence': 10, 'name': 'Big5', 'lang': undefined }, - { 'confidence': 10, 'name': 'GB18030', 'lang': undefined }, - { 'confidence': 10, 'name': 'windows-1253', 'lang': undefined }, + { 'confidence': 19, 'name': 'KOI8-R', 'lang': 'ru' }, + { 'confidence': 10, 'name': 'Big5', 'lang': 'zh' }, + { 'confidence': 10, 'name': 'GB18030', 'lang': 'zh' }, // Mandarin + { 'confidence': 10, 'name': 'windows-1253', 'lang': 'el' }, // Greek { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' }, - { 'confidence': 4, 'name': 'windows-1254', 'lang': undefined }, - { 'confidence': 2, 'name': 'windows-1251', 'lang': undefined }, + { 'confidence': 4, 'name': 'windows-1254', 'lang': 'tr' }, + { 'confidence': 2, 'name': 'windows-1251', 'lang': 'ru' }, ]; it('has both named and default exports', () => { diff --git a/src/match.ts b/src/match.ts index 6245fc6..00c262c 100644 --- a/src/match.ts +++ b/src/match.ts @@ -6,8 +6,8 @@ export interface Match { lang?: string; } -export default (det: Context, rec: Recogniser, confidence: number, name?: string, lang?: string): Match => ({ +export default (ctx: Context, rec: Recogniser, confidence: number): Match => ({ confidence, - name: name || rec.name(det), - lang, + name: rec.name(ctx), + lang: rec.language ? rec.language() : undefined, });