main class
This commit is contained in:
parent
a93a9cd557
commit
dcaa9ba5d7
|
@ -98,7 +98,9 @@ ISO_2022.prototype.match = function(det) {
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports.ISO_2022_JP = function() {
|
module.exports.ISO_2022_JP = function() {
|
||||||
this.name = "ISO-2022-JP";
|
this.name = function() {
|
||||||
|
return "ISO-2022-JP";
|
||||||
|
};
|
||||||
this.escapeSequences = [
|
this.escapeSequences = [
|
||||||
[ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992
|
[ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992
|
||||||
[ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 212-1990
|
[ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 212-1990
|
||||||
|
@ -119,7 +121,9 @@ util.inherits(module.exports.ISO_2022_JP, ISO_2022);
|
||||||
|
|
||||||
|
|
||||||
module.exports.ISO_2022_KR = function() {
|
module.exports.ISO_2022_KR = function() {
|
||||||
this.name = "ISO-2022-KR";
|
this.name = function() {
|
||||||
|
return "ISO-2022-KR";
|
||||||
|
};
|
||||||
this.escapeSequences = [
|
this.escapeSequences = [
|
||||||
[ 0x1b, 0x24, 0x29, 0x43 ]
|
[ 0x1b, 0x24, 0x29, 0x43 ]
|
||||||
];
|
];
|
||||||
|
@ -129,7 +133,9 @@ util.inherits(module.exports.ISO_2022_KR, ISO_2022);
|
||||||
|
|
||||||
|
|
||||||
module.exports.ISO_2022_CN = function() {
|
module.exports.ISO_2022_CN = function() {
|
||||||
this.name = "ISO-2022-CN";
|
this.name = function() {
|
||||||
|
return "ISO-2022-CN";
|
||||||
|
};
|
||||||
this.escapeSequences = [
|
this.escapeSequences = [
|
||||||
[ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80
|
[ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80
|
||||||
[ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1
|
[ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1
|
||||||
|
|
|
@ -193,8 +193,12 @@ mbcs.prototype.nextChar = function(iter, det) {};
|
||||||
* Shift-JIS charset recognizer.
|
* Shift-JIS charset recognizer.
|
||||||
*/
|
*/
|
||||||
module.exports.sjis = function() {
|
module.exports.sjis = function() {
|
||||||
this.name = "Shift_JIS";
|
this.name = function() {
|
||||||
this.language = "ja";
|
return "Shift_JIS";
|
||||||
|
};
|
||||||
|
this.language = function() {
|
||||||
|
return "ja";
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
// of-occurence analysis tool. The data needs to be moved
|
// of-occurence analysis tool. The data needs to be moved
|
||||||
|
@ -243,8 +247,12 @@ util.inherits(module.exports.sjis, mbcs);
|
||||||
* Big5 charset recognizer.
|
* Big5 charset recognizer.
|
||||||
*/
|
*/
|
||||||
module.exports.big5 = function() {
|
module.exports.big5 = function() {
|
||||||
this.name = "Big5";
|
this.name = function() {
|
||||||
this.language = "zh";
|
return "Big5";
|
||||||
|
};
|
||||||
|
this.language = function() {
|
||||||
|
return "zh";
|
||||||
|
};
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
// of-occurence analysis tool. The data needs to be moved
|
// of-occurence analysis tool. The data needs to be moved
|
||||||
// into a resource and loaded from there.
|
// into a resource and loaded from there.
|
||||||
|
@ -363,8 +371,12 @@ util.inherits(module.exports.euc, mbcs);
|
||||||
* is created and kept by the public CharsetDetector class
|
* is created and kept by the public CharsetDetector class
|
||||||
*/
|
*/
|
||||||
module.exports.euc_jp = function() {
|
module.exports.euc_jp = function() {
|
||||||
this.name = "EUC-JP";
|
this.name = function() {
|
||||||
this.language = "ja";
|
return "EUC-JP";
|
||||||
|
};
|
||||||
|
this.language = function() {
|
||||||
|
return "ja";
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
// of-occurence analysis tool. The data needs to be moved
|
// of-occurence analysis tool. The data needs to be moved
|
||||||
|
@ -391,8 +403,12 @@ util.inherits(module.exports.euc_jp, module.exports.euc);
|
||||||
* is created and kept by the public CharsetDetector class
|
* is created and kept by the public CharsetDetector class
|
||||||
*/
|
*/
|
||||||
module.exports.euc_kr = function() {
|
module.exports.euc_kr = function() {
|
||||||
this.name = "EUC-KR";
|
this.name = function() {
|
||||||
this.language = "ko";
|
return "EUC-KR";
|
||||||
|
};
|
||||||
|
this.language = function() {
|
||||||
|
return "ko";
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
// of-occurence analysis tool. The data needs to be moved
|
// of-occurence analysis tool. The data needs to be moved
|
||||||
|
@ -418,8 +434,12 @@ util.inherits(module.exports.euc_kr, module.exports.euc);
|
||||||
* GB-18030 recognizer. Uses simplified Chinese statistics.
|
* GB-18030 recognizer. Uses simplified Chinese statistics.
|
||||||
*/
|
*/
|
||||||
module.exports.gb_18030 = function() {
|
module.exports.gb_18030 = function() {
|
||||||
this.name = "GB18030";
|
this.name = function() {
|
||||||
this.language = "zh";
|
return "GB18030";
|
||||||
|
};
|
||||||
|
this.language = function() {
|
||||||
|
return "zh";
|
||||||
|
};
|
||||||
/*
|
/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
* Get the next character value for EUC based encodings.
|
* Get the next character value for EUC based encodings.
|
||||||
|
|
|
@ -151,7 +151,7 @@ sbcs.prototype.match = function(det) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var name = this.getName(det);
|
var name = this.name(det);
|
||||||
return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang);
|
return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -260,7 +260,7 @@ module.exports.ISO_8859_1 = function() {
|
||||||
])
|
])
|
||||||
];
|
];
|
||||||
|
|
||||||
this.getName = function(det) {
|
this.name = function(det) {
|
||||||
if (typeof det == 'undefined')
|
if (typeof det == 'undefined')
|
||||||
return "ISO-8859-1";
|
return "ISO-8859-1";
|
||||||
return det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
|
return det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
|
||||||
|
@ -332,7 +332,7 @@ module.exports.ISO_8859_2 = function() {
|
||||||
])
|
])
|
||||||
];
|
];
|
||||||
|
|
||||||
this.getName = function(det) {
|
this.name = function(det) {
|
||||||
if (typeof det == 'undefined')
|
if (typeof det == 'undefined')
|
||||||
return "ISO-8859-2";
|
return "ISO-8859-2";
|
||||||
return det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
|
return det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
|
||||||
|
@ -384,11 +384,11 @@ module.exports.ISO_8859_5 = function() {
|
||||||
0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520
|
0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520
|
||||||
];
|
];
|
||||||
|
|
||||||
this.getName = function(det) {
|
this.name = function(det) {
|
||||||
return "ISO-8859-5";
|
return "ISO-8859-5";
|
||||||
};
|
};
|
||||||
|
|
||||||
this.getLanguage = function() {
|
this.language = function() {
|
||||||
return "ru";
|
return "ru";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
@ -438,11 +438,11 @@ module.exports.ISO_8859_6 = function() {
|
||||||
0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620
|
0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620
|
||||||
];
|
];
|
||||||
|
|
||||||
this.getName = function(det) {
|
this.name = function(det) {
|
||||||
return "ISO-8859-6";
|
return "ISO-8859-6";
|
||||||
};
|
};
|
||||||
|
|
||||||
this.getLanguage = function() {
|
this.language = function() {
|
||||||
return "ar";
|
return "ar";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
@ -492,13 +492,13 @@ module.exports.ISO_8859_7 = function() {
|
||||||
0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20
|
0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20
|
||||||
];
|
];
|
||||||
|
|
||||||
this.getName = function(det) {
|
this.name = function(det) {
|
||||||
if (typeof det == 'undefined')
|
if (typeof det == 'undefined')
|
||||||
return "ISO-8859-7";
|
return "ISO-8859-7";
|
||||||
return det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
|
return det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
|
||||||
};
|
};
|
||||||
|
|
||||||
this.getLanguage = function() {
|
this.language = function() {
|
||||||
return "el";
|
return "el";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -7,7 +7,9 @@ var util = require('util'),
|
||||||
*/
|
*/
|
||||||
|
|
||||||
module.exports.UTF_16BE = function() {
|
module.exports.UTF_16BE = function() {
|
||||||
this.name = "UTF-16BE";
|
this.name = function() {
|
||||||
|
return "UTF-16BE";
|
||||||
|
};
|
||||||
this.match = function(det) {
|
this.match = function(det) {
|
||||||
var input = det.fRawInput;
|
var input = det.fRawInput;
|
||||||
|
|
||||||
|
@ -22,7 +24,9 @@ module.exports.UTF_16BE = function() {
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports.UTF_16LE = function() {
|
module.exports.UTF_16LE = function() {
|
||||||
this.name = "UTF-16LE";
|
this.name = function() {
|
||||||
|
return "UTF-16LE";
|
||||||
|
};
|
||||||
this.match = function(det) {
|
this.match = function(det) {
|
||||||
var input = det.fRawInput;
|
var input = det.fRawInput;
|
||||||
|
|
||||||
|
@ -88,7 +92,9 @@ UTF_32.prototype.match = function(det) {
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports.UTF_32BE = function() {
|
module.exports.UTF_32BE = function() {
|
||||||
this.name = "UTF_32BE";
|
this.name = function() {
|
||||||
|
return "UTF_32BE";
|
||||||
|
};
|
||||||
this.getChar = function(input, index) {
|
this.getChar = function(input, index) {
|
||||||
return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
|
return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
|
||||||
(input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
|
(input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
|
||||||
|
@ -97,7 +103,9 @@ module.exports.UTF_32BE = function() {
|
||||||
util.inherits(module.exports.UTF_32BE, UTF_32);
|
util.inherits(module.exports.UTF_32BE, UTF_32);
|
||||||
|
|
||||||
module.exports.UTF_32LE = function() {
|
module.exports.UTF_32LE = function() {
|
||||||
this.name = "UTF_32LE";
|
this.name = function() {
|
||||||
|
return "UTF_32LE";
|
||||||
|
};
|
||||||
this.getChar = function(input, index) {
|
this.getChar = function(input, index) {
|
||||||
return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
|
return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
|
||||||
(input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
|
(input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
|
||||||
|
|
|
@ -5,7 +5,9 @@ var Match = require ('../match');
|
||||||
* Charset recognizer for UTF-8
|
* Charset recognizer for UTF-8
|
||||||
*/
|
*/
|
||||||
module.exports = function() {
|
module.exports = function() {
|
||||||
this.name = "UTF-8";
|
this.name = function() {
|
||||||
|
return "UTF-8";
|
||||||
|
};
|
||||||
this.match = function(det) {
|
this.match = function(det) {
|
||||||
|
|
||||||
var hasBOM = false,
|
var hasBOM = false,
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
|
||||||
|
|
||||||
|
var UTF_8 = require('./encoding/utf8');
|
||||||
|
var unicode = require('./encoding/unicode');
|
||||||
|
var mbcs = require('./encoding/mbcs');
|
||||||
|
var iso2022 = require('./encoding/iso2022');
|
||||||
|
|
||||||
|
// One instance of every recogniser that detect() runs against the input.
var recognisers = [
  // Unicode encodings
  new UTF_8(),
  new unicode.UTF_16BE(),
  new unicode.UTF_16LE(),
  new unicode.UTF_32BE(),
  new unicode.UTF_32LE(),
  // Multi-byte encodings
  new mbcs.sjis(),
  new mbcs.big5(),
  new mbcs.euc_jp(),
  new mbcs.euc_kr(),
  new mbcs.gb_18030(),
  // ISO-2022 escape-sequence encodings
  new iso2022.ISO_2022_JP(),
  new iso2022.ISO_2022_KR(),
  new iso2022.ISO_2022_CN()
];
|
||||||
|
|
||||||
|
/**
 * Run every recogniser in `recognisers` against a buffer and report the
 * best match.
 *
 * @param {Buffer} buffer - raw input bytes; `buffer.length` is read and the
 *   buffer itself is exposed to recognisers as both the raw and the
 *   processed input.
 * @returns {*} the `name` property of the match with the highest
 *   `confidence`, or null when no recogniser produced a match.
 */
module.exports.detect = function(buffer) {

  // Detector state handed to each recogniser's match().
  var det = {
    fRawInput: buffer,
    fRawLength: buffer.length,
    fInputBytes: buffer,
    fInputLen: buffer.length
  };

  var matches = [];
  for (var i = recognisers.length - 1; i >= 0; i--) {
    var recogniser = recognisers[i];
    var match = recogniser.match(det);
    // NOTE(review): assumes a recogniser signals "no match" with a falsy
    // value (sbcs.match returns null in that case) - confirm for the
    // recognisers listed here. Such results are skipped so that sorting
    // and the final lookup only ever see real matches.
    if (match) {
      matches.push(match);
    }
  }

  // Ascending by confidence, so the best candidate ends up last.
  matches.sort(function(a, b) {
    return a.confidence - b.confidence;
  });

  return matches.length > 0 ? matches.pop().name : null;
};
|
3
match.js
3
match.js
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
module.exports = function(det, rec, confidence) {
|
module.exports = function(det, rec, confidence, name, lang) {
|
||||||
// console.log(det, rec, confidence);
|
// console.log(det, rec, confidence);
|
||||||
// this.res = 1;
|
// this.res = 1;
|
||||||
this.confidence = confidence;
|
this.confidence = confidence;
|
||||||
|
this.name = name || rec.getName();
|
||||||
}
|
}
|
Loading…
Reference in New Issue