// chardet/src/encoding/iso2022.ts

import { Context, Recogniser } from '.';
import match, { Match } from '../match';

/**
 * Common base for the ISO 2022 family detectors (JP, KR and CN).
 *
 * A subclass only has to provide the escape sequences that are legal for its
 * encoding; the scoring in `match` is shared by all of them.
 */
class ISO_2022 implements Recogniser {
  /**
   * Escape sequences accepted by the encoding. Element 0 of each sequence is
   * never compared: sequences are only tried at a position that already
   * holds ESC (0x1b).
   */
  escapeSequences: number[][] = [];

  name() {
    return 'ISO_2022';
  }

  /**
   * Scores the input by counting recognised escape sequences, unrecognised
   * escapes and shift-in/shift-out bytes in the first `det.inputLen` bytes of
   * `det.inputBytes`.
   *
   * @param det detection context supplying the bytes and their length
   * @returns `null` when no known escape sequence is present or the final
   *   score is not positive; otherwise whatever the imported `match` helper
   *   returns for this recogniser and the computed confidence
   */
  match(det: Context): Match | null {
    const bytes = det.inputBytes;
    const limit = det.inputLen;

    // Returns the first known sequence that fits completely inside the input
    // at `start` and whose bytes after the leading one all agree with it.
    const sequenceAt = (start: number): number[] | undefined => {
      const available = limit - start;
      for (const candidate of this.escapeSequences) {
        if (candidate.length > available) continue;
        let k = 1;
        while (k < candidate.length && candidate[k] === bytes[start + k]) k++;
        if (k >= candidate.length) return candidate;
      }
      return undefined;
    };

    let recognised = 0;
    let unrecognised = 0;
    let shifts = 0;

    for (let pos = 0; pos < limit; pos++) {
      const byte = bytes[pos];

      if (byte === 0x1b) {
        const sequence = sequenceAt(pos);
        if (sequence === undefined) {
          unrecognised++;
        } else {
          recognised++;
          // Skip the body of the sequence; the loop increment supplies the
          // final step, so scanning resumes on the byte right after it.
          pos += sequence.length - 1;
        }
      } else if (byte === 0x0e || byte === 0x0f) {
        // Shift out / shift in
        shifts++;
      }
    }

    if (recognised === 0) return null;

    // Base score: 100 when every escape was recognised, dropping linearly to
    // 0 when only half of them were (and below 0 when fewer than half were).
    let confidence =
      (100 * recognised - 100 * unrecognised) / (recognised + unrecognised);

    // Too little evidence lowers the score by 10 per missing item. Shifts
    // count as evidence so that input with a single escape sequence but many
    // shifts (the KR case) is not penalised.
    const evidence = recognised + shifts;
    if (evidence < 5) confidence -= (5 - evidence) * 10;

    if (confidence <= 0) return null;
    return match(det, this, confidence);
  }
}

/**
 * Detector for ISO-2022-JP. Reports `ja` as the language and recognises the
 * escape sequences listed in `escapeSequences`.
 */
export class ISO_2022_JP extends ISO_2022 {
  /** Escape sequences treated as legal for this encoding. */
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
    [0x1b, 0x24, 0x28, 0x44], // JIS X 0212-1990
    [0x1b, 0x24, 0x40], // JIS C 6226-1978
    [0x1b, 0x24, 0x41], // GB 2312-80
    [0x1b, 0x24, 0x42], // JIS X 0208-1983
    [0x1b, 0x26, 0x40], // JIS X 0208 1990, 1997
    [0x1b, 0x28, 0x42], // ASCII
    [0x1b, 0x28, 0x48], // JIS-Roman
    [0x1b, 0x28, 0x49], // Half-width katakana
    [0x1b, 0x28, 0x4a], // JIS-Roman
    [0x1b, 0x2e, 0x41], // ISO 8859-1
    [0x1b, 0x2e, 0x46], // ISO 8859-7
  ];

  /** Encoding name reported for a match. */
  name(): string {
    return 'ISO-2022-JP';
  }

  /** Language code reported for a match. */
  language(): string {
    return 'ja';
  }
}

/**
 * Detector for ISO-2022-KR. Only a single escape sequence is registered for
 * this encoding.
 */
export class ISO_2022_KR extends ISO_2022 {
  /** Encoding name reported for a match. */
  name() {
    return 'ISO-2022-KR';
  }

  /**
   * Language code reported for a match.
   *
   * Uses the ISO 639-1 language code for Korean (`ko`), in line with the
   * `ja` / `zh` language codes of the JP and CN detectors. `kr` is the
   * country code for South Korea, not a language code.
   */
  language() {
    return 'ko';
  }

  /** Escape sequences treated as legal for this encoding. */
  escapeSequences = [
    [0x1b, 0x24, 0x29, 0x43], // KS X 1001 designator (ESC $ ) C)
  ];
}

/**
 * Detector for ISO-2022-CN. Reports `zh` as the language and recognises the
 * escape sequences listed in `escapeSequences`.
 */
export class ISO_2022_CN extends ISO_2022 {
  /** Escape sequences treated as legal for this encoding. */
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x29, 0x41], // GB 2312-80
    [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
    [0x1b, 0x24, 0x2a, 0x48], // CNS 11643-1992 Plane 2
    [0x1b, 0x24, 0x29, 0x45], // ISO-IR-165
    [0x1b, 0x24, 0x2b, 0x49], // CNS 11643-1992 Plane 3
    [0x1b, 0x24, 0x2b, 0x4a], // CNS 11643-1992 Plane 4
    [0x1b, 0x24, 0x2b, 0x4b], // CNS 11643-1992 Plane 5
    [0x1b, 0x24, 0x2b, 0x4c], // CNS 11643-1992 Plane 6
    [0x1b, 0x24, 0x2b, 0x4d], // CNS 11643-1992 Plane 7
    [0x1b, 0x4e], // SS2
    [0x1b, 0x4f], // SS3
  ];

  /** Encoding name reported for a match. */
  name(): string {
    return 'ISO-2022-CN';
  }

  /** Language code reported for a match. */
  language(): string {
    return 'zh';
  }
}