2020-03-30 03:42:04 +00:00
|
|
|
import { Context, Recogniser } from '.';
|
2020-09-23 03:10:34 +00:00
|
|
|
import match, { Match } from '../match';
|
2020-03-30 03:42:04 +00:00
|
|
|
|
|
|
|
/**
 * Shared base for the recognisers of the individual members of the
 * ISO 2022 family of encodings (JP, KR and CN in this file).
 *
 * A subclass provides its own `escapeSequences` table and `name()`;
 * the scoring performed by `match()` is common to all of them.
 */
class ISO_2022 implements Recogniser {
  /** Escape sequences, as byte values, that count as recognised. */
  escapeSequences: number[][] = [];

  /** Name reported for this recogniser. */
  name(): string {
    return 'ISO_2022';
  }

  /**
   * Scores the input by the escape sequences found in it.
   *
   * Every ESC byte (0x1b) either starts one of `escapeSequences`
   * (recognised) or does not (unrecognised). Shift-out / shift-in bytes
   * (0x0e / 0x0f) are tallied separately.
   *
   * @param det detection context; `fInputBytes` and `fInputLen` are read.
   * @returns `null` when no recognised sequence was seen or the computed
   *   quality is not positive; otherwise the result of
   *   `match(det, this, quality)`. The quality comes from a
   *   floating-point division, so it is not necessarily an integer.
   */
  match(det: Context): Match | null {
    const bytes = det.fInputBytes;
    const total = det.fInputLen;

    // Length of the first known escape sequence that begins at `start`,
    // or -1 if none of them does. The byte at `start` itself is assumed
    // to be ESC already, so comparison begins at offset 1.
    const knownEscapeLengthAt = (start: number): number => {
      const available = total - start;
      for (const candidate of this.escapeSequences) {
        // Not enough input left to hold this candidate.
        if (available < candidate.length) continue;

        let k = 1;
        while (k < candidate.length && candidate[k] === bytes[start + k]) k++;
        if (k >= candidate.length) return candidate.length;
      }
      return -1;
    };

    let recognised = 0;
    let unrecognised = 0;
    let shiftBytes = 0;

    let pos = 0;
    while (pos < total) {
      const current = bytes[pos];

      if (current === 0x1b) {
        const consumed = knownEscapeLengthAt(pos);
        if (consumed >= 0) {
          recognised++;
          // Skip the whole sequence; its bytes are not examined again.
          pos += consumed;
          continue;
        }
        unrecognised++;
      } else if (current === 0x0e || current === 0x0f) {
        // Shift in/out
        shiftBytes++;
      }

      pos++;
    }

    if (recognised === 0) return null;

    // Base quality reflects the share of recognised escape sequences:
    //   all recognised          -> 100
    //   half or fewer recognised -> 0 or below
    //   linear in between.
    let quality =
      (100 * recognised - 100 * unrecognised) / (recognised + unrecognised);

    // Penalise samples with very little evidence. Shift bytes count as
    // evidence so that KR, which has a single escape sequence but many
    // shifts, is not penalised.
    const evidence = recognised + shiftBytes;
    if (evidence < 5) quality -= (5 - evidence) * 10;

    if (quality <= 0) return null;
    return match(det, this, quality);
  }
}
|
|
|
|
|
|
|
|
/**
 * Recogniser for ISO-2022-JP: supplies the escape-sequence table and
 * the reported name; scoring is inherited from `ISO_2022`.
 */
export class ISO_2022_JP extends ISO_2022 {
  /** Escape sequences counted as recognised for ISO-2022-JP. */
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
    [0x1b, 0x24, 0x28, 0x44], // JIS X 0212-1990
    [0x1b, 0x24, 0x40], // JIS C 6226-1978
    [0x1b, 0x24, 0x41], // GB 2312-80
    [0x1b, 0x24, 0x42], // JIS X 0208-1983
    [0x1b, 0x26, 0x40], // JIS X 0208 1990, 1997
    [0x1b, 0x28, 0x42], // ASCII
    [0x1b, 0x28, 0x48], // JIS-Roman
    [0x1b, 0x28, 0x49], // Half-width katakana
    [0x1b, 0x28, 0x4a], // JIS-Roman
    [0x1b, 0x2e, 0x41], // ISO 8859-1
    [0x1b, 0x2e, 0x46], // ISO 8859-7
  ];

  /** Name reported for this recogniser. */
  name(): string {
    return 'ISO-2022-JP';
  }
}
|
|
|
|
|
|
|
|
/**
 * Recogniser for ISO-2022-KR: supplies the escape-sequence table and
 * the reported name; scoring is inherited from `ISO_2022`.
 */
export class ISO_2022_KR extends ISO_2022 {
  /** The single escape sequence counted as recognised for ISO-2022-KR. */
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x29, 0x43], // ESC $ ) C
  ];

  /** Name reported for this recogniser. */
  name(): string {
    return 'ISO-2022-KR';
  }
}
|
|
|
|
|
|
|
|
/**
 * Recogniser for ISO-2022-CN: supplies the escape-sequence table and
 * the reported name; scoring is inherited from `ISO_2022`.
 */
export class ISO_2022_CN extends ISO_2022 {
  /** Escape sequences counted as recognised for ISO-2022-CN. */
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x29, 0x41], // GB 2312-80
    [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
    [0x1b, 0x24, 0x2a, 0x48], // CNS 11643-1992 Plane 2
    [0x1b, 0x24, 0x29, 0x45], // ISO-IR-165
    [0x1b, 0x24, 0x2b, 0x49], // CNS 11643-1992 Plane 3
    [0x1b, 0x24, 0x2b, 0x4a], // CNS 11643-1992 Plane 4
    [0x1b, 0x24, 0x2b, 0x4b], // CNS 11643-1992 Plane 5
    [0x1b, 0x24, 0x2b, 0x4c], // CNS 11643-1992 Plane 6
    [0x1b, 0x24, 0x2b, 0x4d], // CNS 11643-1992 Plane 7
    [0x1b, 0x4e], // SS2
    [0x1b, 0x4f], // SS3
  ];

  /** Name reported for this recogniser. */
  name(): string {
    return 'ISO-2022-CN';
  }
}
|