From d0e93bb2abea853145721e6c0b18ffcb8d2bbea8 Mon Sep 17 00:00:00 2001
From: Dmitry Shirokov <dshirokov@seek.com.au>
Date: Tue, 19 Oct 2021 21:21:29 +1100
Subject: [PATCH] feat: Language detection improvements

---
 .eslintignore                |  1 +
 src/encoding/index.ts        |  2 +-
 src/encoding/iso2022.test.ts | 23 ++++++++++++++++++++---
 src/encoding/iso2022.ts      | 19 +++++++++++++++----
 src/encoding/sbcs.ts         | 25 ++++++++++++++++---------
 src/index.test.ts            | 12 ++++++------
 src/match.ts                 |  6 +++---
 7 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/.eslintignore b/.eslintignore
index d060117..8edbac4 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -1,2 +1,3 @@
 lib
 jest.config.js
+testing.js
diff --git a/src/encoding/index.ts b/src/encoding/index.ts
index 01686f1..cad5221 100644
--- a/src/encoding/index.ts
+++ b/src/encoding/index.ts
@@ -3,7 +3,7 @@ import { Match } from '../match';
 export interface Recogniser {
   match(input: Context): Match | null;
   name(input?: Context): string;
-  language?(): string;
+  language?(): string | undefined;
 }
 
 export interface Context {
diff --git a/src/encoding/iso2022.test.ts b/src/encoding/iso2022.test.ts
index a074599..21606a0 100644
--- a/src/encoding/iso2022.test.ts
+++ b/src/encoding/iso2022.test.ts
@@ -1,17 +1,34 @@
 import * as chardet from '..';
+import fs from 'fs';
+import path from 'path';
 
 describe('ISO-2022', () => {
   const base = __dirname + '/../test/data/encodings';
 
+  const analyse = (asset: string) =>
+    chardet.analyse(fs.readFileSync(path.join(base, asset))).shift();
+
   it('should return ISO-2022-JP', () => {
-    expect(chardet.detectFileSync(base + '/iso2022jp')).toBe('ISO-2022-JP');
+    expect(analyse('iso2022jp')).toEqual({
+      confidence: 100,
+      lang: 'ja',
+      name: 'ISO-2022-JP',
+    });
   });
 
   it('should return ISO-2022-KR', () => {
-    expect(chardet.detectFileSync(base + '/iso2022kr')).toBe('ISO-2022-KR');
+    expect(analyse('iso2022kr')).toEqual({
+      confidence: 100,
+      lang: 'ko',
+      name: 'ISO-2022-KR',
+    });
   });
 
   it('should return ISO-2022-CN', () => {
-    expect(chardet.detectFileSync(base + '/iso2022cn')).toBe('ISO-2022-CN');
+    expect(analyse('iso2022cn')).toEqual({
+      confidence: 100,
+      lang: 'zh',
+      name: 'ISO-2022-CN',
+    });
   });
 });
diff --git a/src/encoding/iso2022.ts b/src/encoding/iso2022.ts
index 46aa974..598e283 100644
--- a/src/encoding/iso2022.ts
+++ b/src/encoding/iso2022.ts
@@ -33,7 +33,7 @@ class ISO_2022 implements Recogniser {
     let hits = 0;
     let misses = 0;
     let shifts = 0;
-    let quality;
+    let confidence;
 
     // TODO: refactor me
     const text = det.inputBytes;
@@ -73,14 +73,14 @@ class ISO_2022 implements Recogniser {
     //   All good:  quality = 100;
     //   half or less good: quality = 0;
     //   linear in between.
-    quality = (100 * hits - 100 * misses) / (hits + misses);
+    confidence = (100 * hits - 100 * misses) / (hits + misses);
 
     // Back off quality if there were too few escape sequences seen.
     //   Include shifts in this computation, so that KR does not get penalized
     //   for having only a single Escape sequence, but many shifts.
-    if (hits + shifts < 5) quality -= (5 - (hits + shifts)) * 10;
+    if (hits + shifts < 5) confidence -= (5 - (hits + shifts)) * 10;
 
-    return quality <= 0 ? null : match(det, this, quality);
+    return confidence <= 0 ? null : match(det, this, confidence);
   }
 }
 
@@ -88,6 +88,11 @@ export class ISO_2022_JP extends ISO_2022 {
   name() {
     return 'ISO-2022-JP';
   }
+
+  language() {
+    return 'ja';
+  }
+
   escapeSequences = [
     [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
     [0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990
@@ -108,6 +113,9 @@ export class ISO_2022_KR extends ISO_2022 {
   name() {
     return 'ISO-2022-KR';
   }
+  language() {
+    return 'ko'; // ISO 639-1 code for Korean ('kr' is the country code)
+  }
   escapeSequences = [[0x1b, 0x24, 0x29, 0x43]];
 }
 
@@ -115,6 +123,9 @@ export class ISO_2022_CN extends ISO_2022 {
   name() {
     return 'ISO-2022-CN';
   }
+  language() {
+    return 'zh';
+  }
   escapeSequences = [
     [0x1b, 0x24, 0x29, 0x41], // GB 2312-80
     [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
diff --git a/src/encoding/sbcs.ts b/src/encoding/sbcs.ts
index 1484413..bf17036 100644
--- a/src/encoding/sbcs.ts
+++ b/src/encoding/sbcs.ts
@@ -110,6 +110,8 @@ const isFlatNgrams = (val: NGramsPlusLang[] | number[]): val is number[] =>
 class sbcs implements Recogniser {
   spaceChar = 0x20;
 
+  private nGramLang?: string = undefined;
+
   ngrams(): NGramsPlusLang[] | number[] {
     return [];
   }
@@ -122,7 +124,16 @@ class sbcs implements Recogniser {
     return 'sbcs';
   }
 
+  language(): string | undefined {
+    return this.nGramLang;
+  }
+
   match(det: Context): Match | null {
+    // Caching the language on the instance feels a bit dirty. A simpler
+    // alternative would be to split classes such as ISO_8859_1 into
+    // language-specific ones with a hardcoded language, like ISO_8859_9.
+    this.nGramLang = undefined;
+
     const ngrams = this.ngrams();
 
     if (isFlatNgrams(ngrams)) {
@@ -131,24 +142,20 @@ class sbcs implements Recogniser {
       return confidence <= 0 ? null : match(det, this, confidence);
     }
 
-    let bestConfidenceSoFar = -1;
-    let lang;
+    let bestConfidence = -1;
 
     for (let i = ngrams.length - 1; i >= 0; i--) {
       const ngl = ngrams[i];
 
       const parser = new NGramParser(ngl.fNGrams, this.byteMap());
       const confidence = parser.parse(det, this.spaceChar);
-      if (confidence > bestConfidenceSoFar) {
-        bestConfidenceSoFar = confidence;
-        lang = ngl.fLang;
+      if (confidence > bestConfidence) {
+        bestConfidence = confidence;
+        this.nGramLang = ngl.fLang;
       }
     }
 
-    const name = this.name(det);
-    return bestConfidenceSoFar <= 0
-      ? null
-      : match(det, this, bestConfidenceSoFar, name, lang);
+    return bestConfidence <= 0 ? null : match(det, this, bestConfidence);
   }
 }
 
diff --git a/src/index.test.ts b/src/index.test.ts
index 6a1f403..d1eba76 100644
--- a/src/index.test.ts
+++ b/src/index.test.ts
@@ -8,13 +8,13 @@ describe('chardet', () => {
   const expectedEncodingsFromPath = [
     { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
     { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
-    { 'confidence': 19, 'name': 'KOI8-R', 'lang': undefined },
-    { 'confidence': 10, 'name': 'Big5', 'lang': undefined },
-    { 'confidence': 10, 'name': 'GB18030', 'lang': undefined },
-    { 'confidence': 10, 'name': 'windows-1253', 'lang': undefined },
+    { 'confidence': 19, 'name': 'KOI8-R', 'lang': 'ru' },
+    { 'confidence': 10, 'name': 'Big5', 'lang': 'zh' },
+    { 'confidence': 10, 'name': 'GB18030', 'lang': 'zh' }, // Chinese
+    { 'confidence': 10, 'name': 'windows-1253', 'lang': 'el' }, // Greek
     { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' },
-    { 'confidence': 4, 'name': 'windows-1254', 'lang': undefined },
-    { 'confidence': 2, 'name': 'windows-1251', 'lang': undefined },
+    { 'confidence': 4, 'name': 'windows-1254', 'lang': 'tr' },
+    { 'confidence': 2, 'name': 'windows-1251', 'lang': 'ru' },
   ];
 
   it('has both named and default exports', () => {
diff --git a/src/match.ts b/src/match.ts
index 6245fc6..00c262c 100644
--- a/src/match.ts
+++ b/src/match.ts
@@ -6,8 +6,8 @@ export interface Match {
   lang?: string;
 }
 
-export default (det: Context, rec: Recogniser, confidence: number, name?: string, lang?: string): Match => ({
+export default (ctx: Context, rec: Recogniser, confidence: number): Match => ({
   confidence,
-  name: name || rec.name(det),
-  lang,
+  name: rec.name(ctx),
+  lang: rec.language ? rec.language() : undefined,
 });