// chardet/src/encoding/mbcs.ts
//
// 912 lines
// 18 KiB
// TypeScript

import { Context, Recogniser } from '.';
import match, { Match } from '../match';
/**
 * Binary search over an array of numbers (iterative).
 *
 * @param arr Numbers in ascending order; the result is only meaningful
 *            when the array is sorted.
 * @param searchValue The number to locate.
 * @returns The index of an element equal to `searchValue`, or -1 when no
 *          such element is found.
 */
function binarySearch(arr: number[], searchValue: number) {
  let low = 0;
  let high = arr.length - 1;

  while (low <= high) {
    // The unsigned shift halves the sum and already yields an integer.
    const middle = (low + high) >>> 1;
    const candidate = arr[middle];

    if (searchValue > candidate) {
      low = middle + 1;
    } else if (searchValue < candidate) {
      high = middle - 1;
    } else {
      return middle;
    }
  }

  return -1;
}
// Cursor over the raw input used by the multi-byte recognisers.
//
// Each recogniser exposes its characters through a nextChar() function that
// fills an IteratedChar instance with the next character of the input.
// Characters are NOT converted to Unicode: charValue holds the raw bytes of
// the character concatenated into a single number.
//
// The bytes are read from the raw input (det.fRawInput) rather than from a
// markup-stripped copy. Detection only looks at multi-byte characters, which
// effectively ignores markup anyway, and double-byte characters do occur
// inside markup too.
//
class IteratedChar {
  // 1-4 bytes from the raw input data, packed into one number.
  charValue = 0;
  // Position at which the current character starts.
  index = 0;
  // Position of the next byte that nextByte() will read.
  nextIndex = 0;
  // Set by a recogniser when the current byte sequence is illegal.
  error = false;
  // Set once a read was attempted past the end of the input.
  done = false;

  /**
   * Puts the cursor back in its pre-iteration state. Unlike a freshly
   * constructed instance, `index` is -1 after a reset.
   */
  reset() {
    this.done = false;
    this.error = false;
    this.nextIndex = 0;
    this.index = -1;
    this.charValue = 0;
  }

  /**
   * Reads one byte from the raw input and advances the cursor.
   *
   * @param det Detection context supplying `fRawInput` and `fRawLength`.
   * @returns The byte masked to 0-255, or -1 (and `done` set to true) when
   *          the cursor is already at or past `fRawLength`.
   */
  nextByte(det: Context) {
    if (this.nextIndex < det.fRawLength) {
      const position = this.nextIndex;
      this.nextIndex = position + 1;
      return det.fRawInput[position] & 0xff;
    }
    this.done = true;
    return -1;
  }
}
/**
 * Asian double or multi-byte charsets.
 * A match is determined mostly by the input data adhering to the encoding
 * scheme of the charset and, optionally, by the frequency of occurrence of
 * commonly used characters.
 */
class mbcs implements Recogniser {
  /**
   * Raw values of frequently occurring multi-byte characters. The list is
   * looked up with binarySearch(), so it has to be in ascending order.
   */
  commonChars: number[] = [];

  name() {
    return 'mbcs';
  }

  /**
   * Tests how well the input held by `det` fits this charset.
   *
   * The input is walked character by character with nextChar(). Illegal
   * sequences, multi-byte characters and hits in `commonChars` are counted
   * and turned into a confidence between 0 and 100.
   *
   * @param det Detection context holding the raw input bytes.
   * @returns The result of the imported `match` helper for a non-zero
   *          confidence, or null when the confidence is 0.
   */
  match(det: Context): Match | null {
    let doubleByteCharCount = 0,
      commonCharCount = 0,
      badCharCount = 0,
      totalCharCount = 0,
      confidence = 0;
    const iter = new IteratedChar();

    detectBlock: {
      for (iter.reset(); this.nextChar(iter, det); ) {
        totalCharCount++;
        if (iter.error) {
          badCharCount++;
        } else {
          // Reinterpret the packed bytes as an unsigned 32-bit value.
          // `& 0xffffffff` would yield a signed result, so a four-byte
          // character whose lead byte is >= 0x80 would turn negative and be
          // mistaken for a single-byte character.
          const cv = iter.charValue >>> 0;
          if (cv > 0xff) {
            doubleByteCharCount++;
            if (this.commonChars != null) {
              // NOTE: This assumes that there are no 4-byte common chars.
              if (binarySearch(this.commonChars, cv) >= 0) {
                commonCharCount++;
              }
            }
          }
        }
        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
          // Bail out early if the byte data is not matching the encoding
          // scheme; confidence stays 0.
          break detectBlock;
        }
      }

      if (doubleByteCharCount <= 10 && badCharCount === 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount === 0 && totalCharCount < 10) {
          // There weren't any multi-byte sequences, and there was a low
          // density of non-ASCII single bytes. We don't have enough data to
          // have any confidence.
          confidence = 0;
        } else {
          // ASCII or ISO file? It's probably not our encoding, but it is not
          // incompatible with our encoding, so don't give it a zero.
          confidence = 10;
        }
        break detectBlock;
      }

      // No match if there are too many characters that don't fit the
      // encoding scheme.
      if (doubleByteCharCount < 20 * badCharCount) {
        confidence = 0;
        break detectBlock;
      }

      if (this.commonChars == null) {
        // We have no statistics on frequently occurring characters.
        // Assess confidence purely on having a reasonable number of
        // multi-byte characters (the more the better).
        confidence = 30 + doubleByteCharCount - 20 * badCharCount;
        if (confidence > 100) {
          confidence = 100;
        }
      } else {
        // Frequency of occurrence statistics exist: scale the logarithm of
        // the common-character count against the multi-byte count.
        const maxVal = Math.log(doubleByteCharCount / 4);
        const scaleFactor = 90.0 / maxVal;
        confidence = Math.floor(
          Math.log(commonCharCount + 1) * scaleFactor + 10
        );
        confidence = Math.min(confidence, 100);
      }
    } // end of detectBlock

    return confidence === 0 ? null : match(det, this, confidence);
  }

  /**
   * Get the next character (however many bytes it is) from the input data.
   * Subclasses for specific charset encodings must override this function
   * to get characters according to the rules of their encoding scheme; this
   * base version reads nothing and always returns true.
   *
   * This function is not a method of IteratedChar only because that would
   * require a lot of extra derived classes, which is awkward.
   *
   * @param iter The IteratedChar 'struct' into which the returned char is placed.
   * @param det The detection context, needed to get at the input byte data
   *            being iterated over.
   * @returns True if a character was returned, false at end of input.
   */
  nextChar(iter: IteratedChar, det: Context): boolean {
    return true;
  }
}
/**
 * Recogniser for the Shift_JIS charset (Japanese).
 */
export class sjis extends mbcs {
  name() {
    return 'Shift_JIS';
  }

  language() {
    return 'ja';
  }

  // TODO: This table was produced by the character frequency-of-occurrence
  //       analysis tool. It should be moved into a resource and loaded
  //       from there.
  // prettier-ignore
  commonChars = [
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175,
    0x8176, 0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad,
    0x82af, 0x82b1, 0x82b3, 0x82b5, 0x82b7, 0x82bd, 0x82be, 0x82c1,
    0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 0x82cd, 0x82dc,
    0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375,
    0x8376, 0x8389, 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa,
    0x95aa,
  ];

  /**
   * Reads the next Shift_JIS character into `iter`.
   *
   * Lead bytes up to 0x7f and those in 0xa1-0xdf form a character on their
   * own; every other lead byte is combined with one trail byte. A trail
   * byte below 0x40 flags the character as an error.
   *
   * @param iter Cursor that receives the character value and error flag.
   * @param det Detection context supplying the input bytes.
   * @returns False when the input ran out, true otherwise.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;

    const lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    const standsAlone = lead <= 0x7f || (lead > 0xa0 && lead <= 0xdf);
    if (standsAlone) {
      return true;
    }

    const trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }
    iter.charValue = (lead << 8) | trail;

    // Legal trail bytes are 0x40-0x7f and 0x80-0xff, i.e. 0x40-0xff.
    if (trail < 0x40 || trail > 0xff) {
      iter.error = true;
    }
    return true;
  }
}
/**
 * Recogniser for the Big5 charset (Chinese).
 */
export class big5 extends mbcs {
  name() {
    return 'Big5';
  }

  language() {
    return 'zh';
  }

  // TODO: This table was produced by the character frequency-of-occurrence
  //       analysis tool. It should be moved into a resource and loaded
  //       from there.
  // prettier-ignore
  commonChars = [
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176,
    0xa440, 0xa446, 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464,
    0xa46a, 0xa46c, 0xa477, 0xa4a3, 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce,
    0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 0xa558, 0xa569,
    0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751,
    0xa759, 0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4,
    0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe, 0xaacc,
    0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 0xaec9, 0xafe0,
    0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944,
    0xbad8, 0xbb44, 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
  ];

  /**
   * Reads the next Big5 character into `iter`.
   *
   * Lead bytes up to 0x7f, and 0xff, form a character on their own; any
   * other lead byte is combined with one trail byte. Trail bytes below
   * 0x40, or equal to 0x7f or 0xff, flag the character as an error.
   *
   * @param iter Cursor that receives the character value and error flag.
   * @param det Detection context supplying the input bytes.
   * @returns False when the input ran out, true otherwise.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;

    const lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    // Single byte character.
    if (lead <= 0x7f || lead == 0xff) {
      return true;
    }

    const trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }
    iter.charValue = (lead << 8) | trail;

    const trailIsIllegal = trail < 0x40 || trail == 0x7f || trail == 0xff;
    if (trailIsIllegal) {
      iter.error = true;
    }
    return true;
  }
}
/**
 * Shared nextChar() implementation for the EUC charset recognisers in this
 * file (EUC-JP and EUC-KR).
 *
 * Reads the next character according to the EUC encoding scheme. The
 * character 'value' is simply the raw bytes that make up the character,
 * packed into one number.
 *
 * @param iter Cursor that receives the character value and error flag.
 * @param det Detection context supplying the input bytes.
 * @returns True if a character was returned, false once the end of the
 *          input has been reached.
 */
function eucNextChar(iter: IteratedChar, det: Context) {
  iter.index = iter.nextIndex;
  iter.error = false;

  const lead = iter.nextByte(det);
  iter.charValue = lead;

  if (lead < 0) {
    // Ran off the end of the input data.
    iter.done = true;
    return iter.done === false;
  }

  if (lead <= 0x8d) {
    // Single byte char.
    return iter.done === false;
  }

  const second = iter.nextByte(det);
  iter.charValue = (lead << 8) | second;

  // 0xa1-0xfe: ordinary two byte char.
  // 0x8e: Code Set 2. In EUC-JP the total char size is 2 bytes, in EUC-TW
  //       it is 4 bytes, and we don't know which we've got. Treat it like
  //       EUC-JP: if the data really was EUC-TW, the following two bytes
  //       will look like a well formed 2 byte char.
  const isTwoByteLead = (lead >= 0xa1 && lead <= 0xfe) || lead == 0x8e;
  if (isTwoByteLead) {
    if (second < 0xa1) {
      iter.error = true;
    }
  } else if (lead == 0x8f) {
    // Code Set 3: three byte total char size, two bytes of actual value.
    const third = iter.nextByte(det);
    iter.charValue = (iter.charValue << 8) | third;
    if (third < 0xa1) {
      iter.error = true;
    }
  }

  return iter.done === false;
}
/**
 * Recogniser for the EUC-JP charset (Japanese). Characters are read with
 * the shared eucNextChar() function.
 */
export class euc_jp extends mbcs {
  name() {
    return 'EUC-JP';
  }

  language() {
    return 'ja';
  }

  // TODO: This table was produced by the character frequency-of-occurrence
  //       analysis tool. It should be moved into a resource and loaded
  //       from there.
  // prettier-ignore
  commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6,
    0xa1d7, 0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac,
    0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb,
    0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6, 0xa4c7,
    0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb,
    0xa4ec, 0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6,
    0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7,
    0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9,
    0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9,
    0xb9d4, 0xbaee, 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd,
    0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
  ];

  // Characters follow the generic EUC byte scheme.
  nextChar = eucNextChar;
}
/**
 * Recogniser for the EUC-KR charset (Korean). Characters are read with
 * the shared eucNextChar() function.
 */
export class euc_kr extends mbcs {
  name() {
    return 'EUC-KR';
  }

  language() {
    return 'ko';
  }

  // TODO: This table was produced by the character frequency-of-occurrence
  //       analysis tool. It should be moved into a resource and loaded
  //       from there.
  // prettier-ignore
  commonChars = [
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8,
    0xb0fa, 0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa,
    0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf,
    0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 0xb8a6, 0xb8ae,
    0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd,
    0xbcad, 0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5,
    0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8, 0xbefa, 0xbfa1, 0xbfa9,
    0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8,
    0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6,
    0xc1df, 0xc1f6, 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1,
    0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
  ];

  // Characters follow the generic EUC byte scheme.
  nextChar = eucNextChar;
}
/**
 * GB-18030 recognizer. Uses simplified Chinese statistics.
 */
export class gb_18030 extends mbcs {
  name() {
    return 'GB18030';
  }

  language() {
    return 'zh';
  }

  /**
   * Get the next character value for the GB18030 encoding.
   * The character 'value' is simply the raw bytes that make up the
   * character, packed into one non-negative number.
   *
   * Byte layout checked here:
   *  - lead byte <= 0x80: single byte char;
   *  - lead 0x81-0xfe followed by 0x40-0x7e or 0x80-0xfe: two byte char;
   *  - lead 0x81-0xfe, 0x30-0x39, 0x81-0xfe, 0x30-0x39: four byte char;
   *  - any other sequence after a lead of 0x81-0xfe is flagged as an error.
   *
   * @param iter Cursor that receives the character value and error flag.
   * @param det Detection context supplying the input bytes.
   * @returns True if a character was returned, false at end of input.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;
    let firstByte = 0;
    let secondByte = 0;
    let thirdByte = 0;
    let fourthByte = 0;
    buildChar: {
      firstByte = iter.charValue = iter.nextByte(det);
      if (firstByte < 0) {
        // Ran off the end of the input data
        iter.done = true;
        break buildChar;
      }
      if (firstByte <= 0x80) {
        // single byte char
        break buildChar;
      }
      secondByte = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | secondByte;
      if (firstByte >= 0x81 && firstByte <= 0xfe) {
        // Two byte char. The upper trail range starts at 0x80 (hex): a
        // decimal 80 here would let the illegal trail byte 0x7f through.
        if (
          (secondByte >= 0x40 && secondByte <= 0x7e) ||
          (secondByte >= 0x80 && secondByte <= 0xfe)
        ) {
          break buildChar;
        }
        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
          thirdByte = iter.nextByte(det);
          if (thirdByte >= 0x81 && thirdByte <= 0xfe) {
            fourthByte = iter.nextByte(det);
            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
              // `>>> 0` keeps the packed value non-negative; the shift by
              // 16 otherwise overflows into the sign bit because the lead
              // byte is always >= 0x81.
              iter.charValue =
                ((iter.charValue << 16) | (thirdByte << 8) | fourthByte) >>>
                0;
              break buildChar;
            }
          }
        }
        iter.error = true;
        break buildChar;
      }
    }
    return iter.done == false;
  }

  // TODO: This set of data comes from the character frequency-
  //       of-occurrence analysis tool. The data needs to be moved
  //       into a resource and loaded from there.
  // prettier-ignore
  commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3,
    0xa3a1, 0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9,
    0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0,
    0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 0xb8df, 0xb8f6,
    0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd,
    0xbfb4, 0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7,
    0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5,
    0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 0xcad0, 0xcad6,
    0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7,
    0xd2aa, 0xd2b2, 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd,
    0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
  ];
}