commit
b1761347d5
|
@ -0,0 +1,2 @@
|
||||||
|
lib
|
||||||
|
jest.config.js
|
|
@ -0,0 +1,10 @@
|
||||||
|
{
|
||||||
|
"root": true,
|
||||||
|
"parser": "@typescript-eslint/parser",
|
||||||
|
"plugins": ["@typescript-eslint"],
|
||||||
|
"extends": ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
|
||||||
|
"rules": {
|
||||||
|
"@typescript-eslint/no-unused-vars": ["warn", { "varsIgnorePattern": "_" }],
|
||||||
|
"@typescript-eslint/no-inferrable-types": ["off"]
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,8 +1,8 @@
|
||||||
language: node_js
|
language: node_js
|
||||||
node_js:
|
node_js:
|
||||||
- "8"
|
|
||||||
- "10"
|
- "10"
|
||||||
- "12"
|
- "12"
|
||||||
|
- "14"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
include:
|
include:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
20
package.json
20
package.json
|
@ -14,7 +14,7 @@
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "rm -rf lib/* && tsc",
|
"build": "rm -rf lib/* && tsc",
|
||||||
"lint": "tslint -p tsconfig.json -c tslint.json",
|
"lint": "eslint . --ext .js,.jsx,.ts,.tsx",
|
||||||
"lint:types": "tsc --noEmit",
|
"lint:types": "tsc --noEmit",
|
||||||
"format": "prettier --write ./src/**/*.ts",
|
"format": "prettier --write ./src/**/*.ts",
|
||||||
"format:check": "prettier --list-different ./src/**/*.ts",
|
"format:check": "prettier --list-different ./src/**/*.ts",
|
||||||
|
@ -33,14 +33,16 @@
|
||||||
"test": "test"
|
"test": "test"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/jest": "^25.1.4",
|
"@types/jest": "^26.0.14",
|
||||||
"@types/node": "^13.9.5",
|
"@types/node": "^14.11.2",
|
||||||
"jest": "^25.2.4",
|
"@typescript-eslint/eslint-plugin": "^4.2.0",
|
||||||
"prettier": "^2.0.2",
|
"@typescript-eslint/parser": "^4.2.0",
|
||||||
"semantic-release": "^15.14.0",
|
"eslint": "^7.9.0",
|
||||||
"ts-jest": "^25.2.1",
|
"jest": "^26.4.2",
|
||||||
"tslint": "^6.1.0",
|
"prettier": "^2.1.2",
|
||||||
"typescript": "^3.8.3"
|
"semantic-release": "^17.1.2",
|
||||||
|
"ts-jest": "^26.4.0",
|
||||||
|
"typescript": "^4.0.3"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"encoding",
|
"encoding",
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import * as chardet from '..';
|
import * as chardet from '..';
|
||||||
|
|
||||||
describe('ISO-2022', () => {
|
describe('ISO-2022', () => {
|
||||||
var base = __dirname + '/../test/data/encodings';
|
const base = __dirname + '/../test/data/encodings';
|
||||||
|
|
||||||
it('should return ISO-2022-JP', () => {
|
it('should return ISO-2022-JP', () => {
|
||||||
expect(chardet.detectFileSync(base + '/iso2022jp')).toBe('ISO-2022-JP');
|
expect(chardet.detectFileSync(base + '/iso2022jp')).toBe('ISO-2022-JP');
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import { Context, Recogniser } from '.';
|
import { Context, Recogniser } from '.';
|
||||||
|
import match, { Match } from '../match';
|
||||||
var match = require('../match').default;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is a superclass for the individual detectors for
|
* This is a superclass for the individual detectors for
|
||||||
|
@ -15,7 +14,7 @@ class ISO_2022 implements Recogniser {
|
||||||
return 'ISO_2022';
|
return 'ISO_2022';
|
||||||
}
|
}
|
||||||
|
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
/**
|
/**
|
||||||
* Matching function shared among the 2022 detectors JP, CN and KR
|
* Matching function shared among the 2022 detectors JP, CN and KR
|
||||||
* Counts up the number of legal an unrecognized escape sequences in
|
* Counts up the number of legal an unrecognized escape sequences in
|
||||||
|
@ -29,16 +28,16 @@ class ISO_2022 implements Recogniser {
|
||||||
* @return match quality, in the range of 0-100.
|
* @return match quality, in the range of 0-100.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
var i, j;
|
let i, j;
|
||||||
var escN;
|
let escN;
|
||||||
var hits = 0;
|
let hits = 0;
|
||||||
var misses = 0;
|
let misses = 0;
|
||||||
var shifts = 0;
|
let shifts = 0;
|
||||||
var quality;
|
let quality;
|
||||||
|
|
||||||
// TODO: refactor me
|
// TODO: refactor me
|
||||||
var text = det.fInputBytes;
|
const text = det.fInputBytes;
|
||||||
var textLen = det.fInputLen;
|
const textLen = det.fInputLen;
|
||||||
|
|
||||||
scanInput: for (i = 0; i < textLen; i++) {
|
scanInput: for (i = 0; i < textLen; i++) {
|
||||||
if (text[i] == 0x1b) {
|
if (text[i] == 0x1b) {
|
||||||
|
@ -47,7 +46,7 @@ class ISO_2022 implements Recogniser {
|
||||||
escN < this.escapeSequences.length;
|
escN < this.escapeSequences.length;
|
||||||
escN++
|
escN++
|
||||||
) {
|
) {
|
||||||
var seq = this.escapeSequences[escN];
|
const seq = this.escapeSequences[escN];
|
||||||
|
|
||||||
if (textLen - i < seq.length) continue checkEscapes;
|
if (textLen - i < seq.length) continue checkEscapes;
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import * as chardet from '..';
|
import * as chardet from '..';
|
||||||
|
|
||||||
describe('Multibyte Character Sets', () => {
|
describe('Multibyte Character Sets', () => {
|
||||||
var base = __dirname + '/../test/data/encodings';
|
const base = __dirname + '/../test/data/encodings';
|
||||||
|
|
||||||
it('should return Shift_JIS', () => {
|
it('should return Shift_JIS', () => {
|
||||||
expect(chardet.detectFileSync(base + '/shiftjis')).toBe('Shift_JIS');
|
expect(chardet.detectFileSync(base + '/shiftjis')).toBe('Shift_JIS');
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import { Context, Recogniser } from '.';
|
import { Context, Recogniser } from '.';
|
||||||
var match = require('../match').default;
|
import match, { Match } from '../match';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Binary search implementation (recursive)
|
* Binary search implementation (recursive)
|
||||||
|
@ -18,7 +18,7 @@ function binarySearch(arr: number[], searchValue: number) {
|
||||||
There is a bug in the above line;
|
There is a bug in the above line;
|
||||||
Joshua Bloch suggests the following replacement:
|
Joshua Bloch suggests the following replacement:
|
||||||
*/
|
*/
|
||||||
var mid = Math.floor((left + right) >>> 1);
|
const mid = Math.floor((left + right) >>> 1);
|
||||||
if (searchValue > arr[mid]) return find(arr, searchValue, mid + 1, right);
|
if (searchValue > arr[mid]) return find(arr, searchValue, mid + 1, right);
|
||||||
|
|
||||||
if (searchValue < arr[mid]) return find(arr, searchValue, left, mid - 1);
|
if (searchValue < arr[mid]) return find(arr, searchValue, left, mid - 1);
|
||||||
|
@ -68,7 +68,7 @@ class IteratedChar {
|
||||||
this.done = true;
|
this.done = true;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
var byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;
|
const byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;
|
||||||
return byteValue;
|
return byteValue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -97,15 +97,15 @@ class mbcs implements Recogniser {
|
||||||
* bits 0-7: the match confidence, ranging from 0-100
|
* bits 0-7: the match confidence, ranging from 0-100
|
||||||
* bits 8-15: The match reason, an enum-like value.
|
* bits 8-15: The match reason, an enum-like value.
|
||||||
*/
|
*/
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
var singleByteCharCount = 0, //TODO Do we really need this?
|
let singleByteCharCount = 0, //TODO Do we really need this?
|
||||||
doubleByteCharCount = 0,
|
doubleByteCharCount = 0,
|
||||||
commonCharCount = 0,
|
commonCharCount = 0,
|
||||||
badCharCount = 0,
|
badCharCount = 0,
|
||||||
totalCharCount = 0,
|
totalCharCount = 0,
|
||||||
confidence = 0;
|
confidence = 0;
|
||||||
|
|
||||||
var iter = new IteratedChar();
|
const iter = new IteratedChar();
|
||||||
|
|
||||||
detectBlock: {
|
detectBlock: {
|
||||||
for (iter.reset(); this.nextChar(iter, det); ) {
|
for (iter.reset(); this.nextChar(iter, det); ) {
|
||||||
|
@ -113,7 +113,7 @@ class mbcs implements Recogniser {
|
||||||
if (iter.error) {
|
if (iter.error) {
|
||||||
badCharCount++;
|
badCharCount++;
|
||||||
} else {
|
} else {
|
||||||
var cv = iter.charValue & 0xffffffff;
|
const cv = iter.charValue & 0xffffffff;
|
||||||
|
|
||||||
if (cv <= 0xff) {
|
if (cv <= 0xff) {
|
||||||
singleByteCharCount++;
|
singleByteCharCount++;
|
||||||
|
@ -159,7 +159,7 @@ class mbcs implements Recogniser {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.commonChars == null) {
|
if (this.commonChars == null) {
|
||||||
// We have no statistics on frequently occuring characters.
|
// We have no statistics on frequently occurring characters.
|
||||||
// Assess confidence purely on having a reasonable number of
|
// Assess confidence purely on having a reasonable number of
|
||||||
// multi-byte characters (the more the better
|
// multi-byte characters (the more the better
|
||||||
confidence = 30 + doubleByteCharCount - 20 * badCharCount;
|
confidence = 30 + doubleByteCharCount - 20 * badCharCount;
|
||||||
|
@ -167,12 +167,9 @@ class mbcs implements Recogniser {
|
||||||
confidence = 100;
|
confidence = 100;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//
|
|
||||||
// Frequency of occurrence statistics exist.
|
// Frequency of occurrence statistics exist.
|
||||||
//
|
const maxVal = Math.log(doubleByteCharCount / 4);
|
||||||
// @ts-ignore
|
const scaleFactor = 90.0 / maxVal;
|
||||||
var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
|
|
||||||
var scaleFactor = 90.0 / maxVal;
|
|
||||||
confidence = Math.floor(
|
confidence = Math.floor(
|
||||||
Math.log(commonCharCount + 1) * scaleFactor + 10
|
Math.log(commonCharCount + 1) * scaleFactor + 10
|
||||||
);
|
);
|
||||||
|
@ -278,14 +275,13 @@ export class sjis extends mbcs {
|
||||||
iter.index = iter.nextIndex;
|
iter.index = iter.nextIndex;
|
||||||
iter.error = false;
|
iter.error = false;
|
||||||
|
|
||||||
var firstByte;
|
const firstByte = (iter.charValue = iter.nextByte(det));
|
||||||
firstByte = iter.charValue = iter.nextByte(det);
|
|
||||||
if (firstByte < 0) return false;
|
if (firstByte < 0) return false;
|
||||||
|
|
||||||
if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf))
|
if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
var secondByte = iter.nextByte(det);
|
const secondByte = iter.nextByte(det);
|
||||||
if (secondByte < 0) return false;
|
if (secondByte < 0) return false;
|
||||||
|
|
||||||
iter.charValue = (firstByte << 8) | secondByte;
|
iter.charValue = (firstByte << 8) | secondByte;
|
||||||
|
@ -418,14 +414,14 @@ export class big5 extends mbcs {
|
||||||
iter.index = iter.nextIndex;
|
iter.index = iter.nextIndex;
|
||||||
iter.error = false;
|
iter.error = false;
|
||||||
|
|
||||||
var firstByte = (iter.charValue = iter.nextByte(det));
|
const firstByte = (iter.charValue = iter.nextByte(det));
|
||||||
|
|
||||||
if (firstByte < 0) return false;
|
if (firstByte < 0) return false;
|
||||||
|
|
||||||
// single byte character.
|
// single byte character.
|
||||||
if (firstByte <= 0x7f || firstByte == 0xff) return true;
|
if (firstByte <= 0x7f || firstByte == 0xff) return true;
|
||||||
|
|
||||||
var secondByte = iter.nextByte(det);
|
const secondByte = iter.nextByte(det);
|
||||||
|
|
||||||
if (secondByte < 0) return false;
|
if (secondByte < 0) return false;
|
||||||
|
|
||||||
|
@ -450,9 +446,9 @@ export class big5 extends mbcs {
|
||||||
function eucNextChar(iter: IteratedChar, det: Context) {
|
function eucNextChar(iter: IteratedChar, det: Context) {
|
||||||
iter.index = iter.nextIndex;
|
iter.index = iter.nextIndex;
|
||||||
iter.error = false;
|
iter.error = false;
|
||||||
var firstByte = 0;
|
let firstByte = 0;
|
||||||
var secondByte = 0;
|
let secondByte = 0;
|
||||||
var thirdByte = 0;
|
let thirdByte = 0;
|
||||||
//int fourthByte = 0;
|
//int fourthByte = 0;
|
||||||
buildChar: {
|
buildChar: {
|
||||||
firstByte = iter.charValue = iter.nextByte(det);
|
firstByte = iter.charValue = iter.nextByte(det);
|
||||||
|
@ -763,10 +759,10 @@ export class gb_18030 extends mbcs {
|
||||||
nextChar(iter: IteratedChar, det: Context) {
|
nextChar(iter: IteratedChar, det: Context) {
|
||||||
iter.index = iter.nextIndex;
|
iter.index = iter.nextIndex;
|
||||||
iter.error = false;
|
iter.error = false;
|
||||||
var firstByte = 0;
|
let firstByte = 0;
|
||||||
var secondByte = 0;
|
let secondByte = 0;
|
||||||
var thirdByte = 0;
|
let thirdByte = 0;
|
||||||
var fourthByte = 0;
|
let fourthByte = 0;
|
||||||
buildChar: {
|
buildChar: {
|
||||||
firstByte = iter.charValue = iter.nextByte(det);
|
firstByte = iter.charValue = iter.nextByte(det);
|
||||||
if (firstByte < 0) {
|
if (firstByte < 0) {
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import * as chardet from '..';
|
import * as chardet from '..';
|
||||||
|
|
||||||
describe('Singlebyte Character Sets', () => {
|
describe('Singlebyte Character Sets', () => {
|
||||||
var base = __dirname + '/../test/data/encodings';
|
const base = __dirname + '/../test/data/encodings';
|
||||||
|
|
||||||
it('should return ISO-8859-1 (English)', () => {
|
it('should return ISO-8859-1 (English)', () => {
|
||||||
expect(chardet.detectFileSync(base + '/iso88591_en')).toBe('ISO-8859-1');
|
expect(chardet.detectFileSync(base + '/iso88591_en')).toBe('ISO-8859-1');
|
||||||
|
|
|
@ -1,13 +1,12 @@
|
||||||
import { Context, Recogniser } from '../encoding/index';
|
import { Context, Recogniser } from '../encoding/index';
|
||||||
|
import match, { Match } from '../match';
|
||||||
var match = require('../match').default;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class recognizes single-byte encodings. Because the encoding scheme is so
|
* This class recognizes single-byte encodings. Because the encoding scheme is so
|
||||||
* simple, language statistics are used to do the matching.
|
* simple, language statistics are used to do the matching.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
var N_GRAM_MASK = 0xffffff;
|
const N_GRAM_MASK = 0xffffff;
|
||||||
|
|
||||||
class NGramParser {
|
class NGramParser {
|
||||||
byteIndex: number = 0;
|
byteIndex: number = 0;
|
||||||
|
@ -31,7 +30,7 @@ class NGramParser {
|
||||||
* Binary search for value in table, which must have exactly 64 entries.
|
* Binary search for value in table, which must have exactly 64 entries.
|
||||||
*/
|
*/
|
||||||
search(table: number[], value: number) {
|
search(table: number[], value: number) {
|
||||||
var index = 0;
|
let index = 0;
|
||||||
|
|
||||||
if (table[index + 32] <= value) index += 32;
|
if (table[index + 32] <= value) index += 32;
|
||||||
if (table[index + 16] <= value) index += 16;
|
if (table[index + 16] <= value) index += 16;
|
||||||
|
@ -65,12 +64,12 @@ class NGramParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
parse(det: Context, spaceCh: number) {
|
parse(det: Context, spaceCh: number) {
|
||||||
var b,
|
let b,
|
||||||
ignoreSpace = false;
|
ignoreSpace = false;
|
||||||
this.spaceChar = spaceCh;
|
this.spaceChar = spaceCh;
|
||||||
|
|
||||||
while ((b = this.nextByte(det)) >= 0) {
|
while ((b = this.nextByte(det)) >= 0) {
|
||||||
var mb = this.byteMap[b];
|
const mb = this.byteMap[b];
|
||||||
|
|
||||||
// TODO: 0x20 might not be a space in all character sets...
|
// TODO: 0x20 might not be a space in all character sets...
|
||||||
if (mb != 0) {
|
if (mb != 0) {
|
||||||
|
@ -85,7 +84,7 @@ class NGramParser {
|
||||||
// TODO: Is this OK? The buffer could have ended in the middle of a word...
|
// TODO: Is this OK? The buffer could have ended in the middle of a word...
|
||||||
this.addByte(this.spaceChar);
|
this.addByte(this.spaceChar);
|
||||||
|
|
||||||
var rawPercent = this.hitCount / this.ngramCount;
|
const rawPercent = this.hitCount / this.ngramCount;
|
||||||
|
|
||||||
// TODO - This is a bit of a hack to take care of a case
|
// TODO - This is a bit of a hack to take care of a case
|
||||||
// were we were getting a confidence of 135...
|
// were we were getting a confidence of 135...
|
||||||
|
@ -119,35 +118,34 @@ class sbcs implements Recogniser {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// @ts-ignore
|
|
||||||
name(input: Context): string {
|
name(input: Context): string {
|
||||||
return 'sbcs';
|
return 'sbcs';
|
||||||
}
|
}
|
||||||
|
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
var ngrams = this.ngrams();
|
const ngrams = this.ngrams();
|
||||||
|
|
||||||
if (isFlatNgrams(ngrams)) {
|
if (isFlatNgrams(ngrams)) {
|
||||||
var parser = new NGramParser(ngrams, this.byteMap());
|
const parser = new NGramParser(ngrams, this.byteMap());
|
||||||
var confidence = parser.parse(det, this.spaceChar);
|
const confidence = parser.parse(det, this.spaceChar);
|
||||||
return confidence <= 0 ? null : match(det, this, confidence);
|
return confidence <= 0 ? null : match(det, this, confidence);
|
||||||
}
|
}
|
||||||
|
|
||||||
var bestConfidenceSoFar = -1;
|
let bestConfidenceSoFar = -1;
|
||||||
var lang = null;
|
let lang;
|
||||||
|
|
||||||
for (var i = ngrams.length - 1; i >= 0; i--) {
|
for (let i = ngrams.length - 1; i >= 0; i--) {
|
||||||
var ngl = ngrams[i];
|
const ngl = ngrams[i];
|
||||||
|
|
||||||
var parser = new NGramParser(ngl.fNGrams, this.byteMap());
|
const parser = new NGramParser(ngl.fNGrams, this.byteMap());
|
||||||
var confidence = parser.parse(det, this.spaceChar);
|
const confidence = parser.parse(det, this.spaceChar);
|
||||||
if (confidence > bestConfidenceSoFar) {
|
if (confidence > bestConfidenceSoFar) {
|
||||||
bestConfidenceSoFar = confidence;
|
bestConfidenceSoFar = confidence;
|
||||||
lang = ngl.fLang;
|
lang = ngl.fLang;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var name = this.name(det);
|
const name = this.name(det);
|
||||||
return bestConfidenceSoFar <= 0
|
return bestConfidenceSoFar <= 0
|
||||||
? null
|
? null
|
||||||
: match(det, this, bestConfidenceSoFar, name, lang);
|
: match(det, this, bestConfidenceSoFar, name, lang);
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import * as chardet from '..';
|
import * as chardet from '..';
|
||||||
|
|
||||||
describe('Unicode', () => {
|
describe('Unicode', () => {
|
||||||
var base = __dirname + '/../test/data/encodings';
|
const base = __dirname + '/../test/data/encodings';
|
||||||
|
|
||||||
it('should return UTF-16LE', () => {
|
it('should return UTF-16LE', () => {
|
||||||
expect(chardet.detectFileSync(base + '/utf16le')).toBe('UTF-16LE');
|
expect(chardet.detectFileSync(base + '/utf16le')).toBe('UTF-16LE');
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import { Context, Recogniser } from '.';
|
import { Context, Recogniser } from '.';
|
||||||
const match = require('../match').default;
|
import match, { Match } from '../match';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class matches UTF-16 and UTF-32, both big- and little-endian. The
|
* This class matches UTF-16 and UTF-32, both big- and little-endian. The
|
||||||
|
@ -10,8 +10,8 @@ export class UTF_16BE implements Recogniser {
|
||||||
return 'UTF-16BE';
|
return 'UTF-16BE';
|
||||||
}
|
}
|
||||||
|
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
var input = det.fRawInput;
|
const input = det.fRawInput;
|
||||||
|
|
||||||
if (
|
if (
|
||||||
input.length >= 2 &&
|
input.length >= 2 &&
|
||||||
|
@ -30,8 +30,8 @@ export class UTF_16LE implements Recogniser {
|
||||||
name() {
|
name() {
|
||||||
return 'UTF-16LE';
|
return 'UTF-16LE';
|
||||||
}
|
}
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
var input = det.fRawInput;
|
const input = det.fRawInput;
|
||||||
|
|
||||||
if (
|
if (
|
||||||
input.length >= 2 &&
|
input.length >= 2 &&
|
||||||
|
@ -64,13 +64,13 @@ class UTF_32 implements Recogniser, WithGetChar {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
var input = det.fRawInput,
|
let numValid = 0,
|
||||||
limit = (det.fRawLength / 4) * 4,
|
|
||||||
numValid = 0,
|
|
||||||
numInvalid = 0,
|
numInvalid = 0,
|
||||||
hasBOM = false,
|
hasBOM = false,
|
||||||
confidence = 0;
|
confidence = 0;
|
||||||
|
const limit = (det.fRawLength / 4) * 4;
|
||||||
|
const input = det.fRawInput;
|
||||||
|
|
||||||
if (limit == 0) {
|
if (limit == 0) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -80,8 +80,8 @@ class UTF_32 implements Recogniser, WithGetChar {
|
||||||
hasBOM = true;
|
hasBOM = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var i = 0; i < limit; i += 4) {
|
for (let i = 0; i < limit; i += 4) {
|
||||||
var ch = this.getChar(input, i);
|
const ch = this.getChar(input, i);
|
||||||
|
|
||||||
if (ch < 0 || ch >= 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff)) {
|
if (ch < 0 || ch >= 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff)) {
|
||||||
numInvalid += 1;
|
numInvalid += 1;
|
||||||
|
|
|
@ -1,19 +1,18 @@
|
||||||
import { Context, Recogniser } from '.';
|
import { Context, Recogniser } from '.';
|
||||||
|
import match, { Match } from '../match';
|
||||||
var match = require('../match').default;
|
|
||||||
|
|
||||||
export default class Utf8 implements Recogniser {
|
export default class Utf8 implements Recogniser {
|
||||||
name() {
|
name() {
|
||||||
return 'UTF-8';
|
return 'UTF-8';
|
||||||
}
|
}
|
||||||
|
|
||||||
match(det: Context) {
|
match(det: Context): Match | null {
|
||||||
var hasBOM = false,
|
let hasBOM = false,
|
||||||
numValid = 0,
|
numValid = 0,
|
||||||
numInvalid = 0,
|
numInvalid = 0,
|
||||||
input = det.fRawInput,
|
|
||||||
trailBytes = 0,
|
trailBytes = 0,
|
||||||
confidence;
|
confidence;
|
||||||
|
const input = det.fRawInput;
|
||||||
|
|
||||||
if (
|
if (
|
||||||
det.fRawLength >= 3 &&
|
det.fRawLength >= 3 &&
|
||||||
|
@ -25,8 +24,8 @@ export default class Utf8 implements Recogniser {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scan for multi-byte sequences
|
// Scan for multi-byte sequences
|
||||||
for (var i = 0; i < det.fRawLength; i++) {
|
for (let i = 0; i < det.fRawLength; i++) {
|
||||||
var b = input[i];
|
const b = input[i];
|
||||||
if ((b & 0x80) == 0) continue; // ASCII
|
if ((b & 0x80) == 0) continue; // ASCII
|
||||||
|
|
||||||
// Hi bit on char found. Figure out how long the sequence should be
|
// Hi bit on char found. Figure out how long the sequence should be
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
|
import { Context, Recogniser } from "./encoding";
|
||||||
|
|
||||||
export interface Match {
|
export interface Match {
|
||||||
confidence: number;
|
confidence: number;
|
||||||
name: string;
|
name: string;
|
||||||
lang: string;
|
lang?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
// @ts-ignore
|
export default (det: Context, rec: Recogniser, confidence: number, name?: string, lang?: string): Match => ({
|
||||||
export default (det, rec, confidence, name, lang): Match => ({
|
|
||||||
confidence,
|
confidence,
|
||||||
name: name || rec.name(det),
|
name: name || rec.name(det),
|
||||||
lang,
|
lang,
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
"removeComments": true,
|
"removeComments": true,
|
||||||
"sourceMap": true,
|
"sourceMap": true,
|
||||||
"strict": true,
|
"strict": true,
|
||||||
"target": "esnext"
|
"target": "ES2019"
|
||||||
},
|
},
|
||||||
"exclude": ["node_modules", "**/*.spec.ts", "**/*.test.ts", "__mocks__"]
|
"exclude": ["node_modules", "**/*.spec.ts", "**/*.test.ts", "__mocks__", "lib"]
|
||||||
}
|
}
|
||||||
|
|
23
tslint.json
23
tslint.json
|
@ -1,23 +0,0 @@
|
||||||
{
|
|
||||||
"extends": "tslint:recommended",
|
|
||||||
"rules": {
|
|
||||||
"interface-name": [true, "never-prefix"],
|
|
||||||
"quotemark": [true, "single"],
|
|
||||||
"no-bitwise": false,
|
|
||||||
"trailing-comma": [
|
|
||||||
true,
|
|
||||||
{
|
|
||||||
"multiline": {
|
|
||||||
"objects": "always",
|
|
||||||
"arrays": "always",
|
|
||||||
"functions": "never",
|
|
||||||
"typeLiterals": "ignore"
|
|
||||||
},
|
|
||||||
"esSpecCompliant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"object-literal-sort-keys": false,
|
|
||||||
"radix": false,
|
|
||||||
"forin": false
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue