main class

This commit is contained in:
Dmitry Shirokov 2013-03-04 23:47:01 +04:00
parent a93a9cd557
commit dcaa9ba5d7
8 changed files with 109 additions and 28 deletions

View File

@ -98,7 +98,9 @@ ISO_2022.prototype.match = function(det) {
};
module.exports.ISO_2022_JP = function() {
this.name = "ISO-2022-JP";
this.name = function() {
return "ISO-2022-JP";
};
this.escapeSequences = [
[ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992
[ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 212-1990
@ -119,7 +121,9 @@ util.inherits(module.exports.ISO_2022_JP, ISO_2022);
module.exports.ISO_2022_KR = function() {
this.name = "ISO-2022-KR";
this.name = function() {
return "ISO-2022-KR";
};
this.escapeSequences = [
[ 0x1b, 0x24, 0x29, 0x43 ]
];
@ -129,7 +133,9 @@ util.inherits(module.exports.ISO_2022_KR, ISO_2022);
module.exports.ISO_2022_CN = function() {
this.name = "ISO-2022-CN";
this.name = function() {
return "ISO-2022-CN";
};
this.escapeSequences = [
[ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80
[ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1

View File

@ -193,8 +193,12 @@ mbcs.prototype.nextChar = function(iter, det) {};
* Shift-JIS charset recognizer.
*/
module.exports.sjis = function() {
this.name = "Shift_JIS";
this.language = "ja";
this.name = function() {
return "Shift_JIS";
};
this.language = function() {
return "ja";
};
// TODO: This set of data comes from the character frequency-
// of-occurrence analysis tool. The data needs to be moved
@ -243,8 +247,12 @@ util.inherits(module.exports.sjis, mbcs);
* Big5 charset recognizer.
*/
module.exports.big5 = function() {
this.name = "Big5";
this.language = "zh";
this.name = function() {
return "Big5";
};
this.language = function() {
return "zh";
};
// TODO: This set of data comes from the character frequency-
// of-occurrence analysis tool. The data needs to be moved
// into a resource and loaded from there.
@ -363,8 +371,12 @@ util.inherits(module.exports.euc, mbcs);
* is created and kept by the public CharsetDetector class
*/
module.exports.euc_jp = function() {
this.name = "EUC-JP";
this.language = "ja";
this.name = function() {
return "EUC-JP";
};
this.language = function() {
return "ja";
};
// TODO: This set of data comes from the character frequency-
// of-occurrence analysis tool. The data needs to be moved
@ -391,8 +403,12 @@ util.inherits(module.exports.euc_jp, module.exports.euc);
* is created and kept by the public CharsetDetector class
*/
module.exports.euc_kr = function() {
this.name = "EUC-KR";
this.language = "ko";
this.name = function() {
return "EUC-KR";
};
this.language = function() {
return "ko";
};
// TODO: This set of data comes from the character frequency-
// of-occurrence analysis tool. The data needs to be moved
@ -418,8 +434,12 @@ util.inherits(module.exports.euc_kr, module.exports.euc);
* GB-18030 recognizer. Uses simplified Chinese statistics.
*/
module.exports.gb_18030 = function() {
this.name = "GB18030";
this.language = "zh";
this.name = function() {
return "GB18030";
};
this.language = function() {
return "zh";
};
/*
* (non-Javadoc)
* Get the next character value for EUC based encodings.

View File

@ -151,7 +151,7 @@ sbcs.prototype.match = function(det) {
}
}
var name = this.getName(det);
var name = this.name(det);
return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang);
}
@ -260,7 +260,7 @@ module.exports.ISO_8859_1 = function() {
])
];
this.getName = function(det) {
this.name = function(det) {
if (typeof det == 'undefined')
return "ISO-8859-1";
return det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
@ -332,7 +332,7 @@ module.exports.ISO_8859_2 = function() {
])
];
this.getName = function(det) {
this.name = function(det) {
if (typeof det == 'undefined')
return "ISO-8859-2";
return det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
@ -384,11 +384,11 @@ module.exports.ISO_8859_5 = function() {
0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520
];
this.getName = function(det) {
this.name = function(det) {
return "ISO-8859-5";
};
this.getLanguage = function() {
this.language = function() {
return "ru";
};
};
@ -438,11 +438,11 @@ module.exports.ISO_8859_6 = function() {
0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620
];
this.getName = function(det) {
this.name = function(det) {
return "ISO-8859-6";
};
this.getLanguage = function() {
this.language = function() {
return "ar";
};
};
@ -492,13 +492,13 @@ module.exports.ISO_8859_7 = function() {
0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20
];
this.getName = function(det) {
this.name = function(det) {
if (typeof det == 'undefined')
return "ISO-8859-7";
return det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
};
this.getLanguage = function() {
this.language = function() {
return "el";
};
};

View File

@ -7,7 +7,9 @@ var util = require('util'),
*/
module.exports.UTF_16BE = function() {
this.name = "UTF-16BE";
this.name = function() {
return "UTF-16BE";
};
this.match = function(det) {
var input = det.fRawInput;
@ -22,7 +24,9 @@ module.exports.UTF_16BE = function() {
};
module.exports.UTF_16LE = function() {
this.name = "UTF-16LE";
this.name = function() {
return "UTF-16LE";
};
this.match = function(det) {
var input = det.fRawInput;
@ -88,7 +92,9 @@ UTF_32.prototype.match = function(det) {
};
module.exports.UTF_32BE = function() {
this.name = "UTF_32BE";
// Returns the charset name for this recogniser. The name is spelled with a
// hyphen ("UTF-32BE"), matching the "UTF-16BE"/"UTF-16LE" names used by the
// sibling recognisers, rather than the underscore of the constructor name.
this.name = function() {
  return "UTF-32BE";
};
this.getChar = function(input, index) {
return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
(input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
@ -97,7 +103,9 @@ module.exports.UTF_32BE = function() {
util.inherits(module.exports.UTF_32BE, UTF_32);
module.exports.UTF_32LE = function() {
this.name = "UTF_32LE";
// Returns the charset name for this recogniser. The name is spelled with a
// hyphen ("UTF-32LE"), matching the "UTF-16BE"/"UTF-16LE" names used by the
// sibling recognisers, rather than the underscore of the constructor name.
this.name = function() {
  return "UTF-32LE";
};
this.getChar = function(input, index) {
return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
(input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);

View File

@ -5,7 +5,9 @@ var Match = require ('../match');
* Charset recognizer for UTF-8
*/
module.exports = function() {
this.name = "UTF-8";
this.name = function() {
return "UTF-8";
};
this.match = function(det) {
var hasBOM = false,

44
index.js Normal file
View File

@ -0,0 +1,44 @@
var UTF_8 = require('./encoding/utf8');
var unicode = require('./encoding/unicode');
var mbcs = require('./encoding/mbcs');
var iso2022 = require('./encoding/iso2022');
// Recogniser instances that detect() runs against the input. Each entry is
// constructed once at module load (`new X` without parentheses is the same
// as `new X()`), so the instances are shared by every detect() call.
// NOTE(review): no single-byte (ISO-8859-*) recognisers are registered here -
// confirm whether that is intentional.
var recognisers = [
new UTF_8,
new unicode.UTF_16BE,
new unicode.UTF_16LE,
new unicode.UTF_32BE,
new unicode.UTF_32LE,
new mbcs.sjis,
new mbcs.big5,
new mbcs.euc_jp,
new mbcs.euc_kr,
new mbcs.gb_18030,
new iso2022.ISO_2022_JP,
new iso2022.ISO_2022_KR,
new iso2022.ISO_2022_CN
];
/**
 * Guess the character encoding of a chunk of bytes.
 *
 * Every recogniser in `recognisers` is run against the input; the name of the
 * match with the highest confidence is returned.
 *
 * @param {Object} buffer - Byte container with a `length` property (presumably
 *   a Node Buffer - confirm against callers); it is handed to the recognisers
 *   unchanged.
 * @returns {?string} Name carried by the most confident match, or null when
 *   no recogniser produced a match.
 */
module.exports.detect = function(buffer) {
  // Detector state passed to every recogniser. The "raw" and "input" views
  // both point at the same, unprocessed buffer.
  var det = {
    fRawInput: buffer,
    fRawLength: buffer.length,
    fInputBytes: buffer,
    fInputLen: buffer.length
  };

  var matches = [];
  for (var i = recognisers.length - 1; i >= 0; i--) {
    var match = recognisers[i].match(det);
    // A recogniser may signal "no match" with a falsy value; keeping those
    // would crash the confidence comparison below.
    if (match) {
      matches.push(match);
    }
  }

  if (matches.length === 0) {
    return null;
  }

  // Ascending by confidence, so the best candidate ends up last.
  matches.sort(function(a, b) {
    return a.confidence - b.confidence;
  });

  return matches.pop().name;
};

View File

@ -1,6 +1,7 @@
module.exports = function(det, rec, confidence) {
module.exports = function(det, rec, confidence, name, lang) {
// console.log(det, rec, confidence);
// this.res = 1;
this.confidence = confidence;
this.name = name || rec.getName();
}

0
test.js Normal file
View File