main class

This commit is contained in:
Dmitry Shirokov 2013-03-04 23:47:01 +04:00
parent a93a9cd557
commit dcaa9ba5d7
8 changed files with 109 additions and 28 deletions

View File

@ -98,7 +98,9 @@ ISO_2022.prototype.match = function(det) {
}; };
module.exports.ISO_2022_JP = function() { module.exports.ISO_2022_JP = function() {
this.name = "ISO-2022-JP"; this.name = function() {
return "ISO-2022-JP";
};
this.escapeSequences = [ this.escapeSequences = [
[ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992 [ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992
[ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 212-1990 [ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 212-1990
@ -119,7 +121,9 @@ util.inherits(module.exports.ISO_2022_JP, ISO_2022);
module.exports.ISO_2022_KR = function() { module.exports.ISO_2022_KR = function() {
this.name = "ISO-2022-KR"; this.name = function() {
return "ISO-2022-KR";
};
this.escapeSequences = [ this.escapeSequences = [
[ 0x1b, 0x24, 0x29, 0x43 ] [ 0x1b, 0x24, 0x29, 0x43 ]
]; ];
@ -129,7 +133,9 @@ util.inherits(module.exports.ISO_2022_KR, ISO_2022);
module.exports.ISO_2022_CN = function() { module.exports.ISO_2022_CN = function() {
this.name = "ISO-2022-CN"; this.name = function() {
return "ISO-2022-CN";
};
this.escapeSequences = [ this.escapeSequences = [
[ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80 [ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80
[ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1 [ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1

View File

@ -193,8 +193,12 @@ mbcs.prototype.nextChar = function(iter, det) {};
* Shift-JIS charset recognizer. * Shift-JIS charset recognizer.
*/ */
module.exports.sjis = function() { module.exports.sjis = function() {
this.name = "Shift_JIS"; this.name = function() {
this.language = "ja"; return "Shift_JIS";
};
this.language = function() {
return "ja";
};
// TODO: This set of data comes from the character frequency- // TODO: This set of data comes from the character frequency-
// of-occurence analysis tool. The data needs to be moved // of-occurence analysis tool. The data needs to be moved
@ -243,8 +247,12 @@ util.inherits(module.exports.sjis, mbcs);
* Big5 charset recognizer. * Big5 charset recognizer.
*/ */
module.exports.big5 = function() { module.exports.big5 = function() {
this.name = "Big5"; this.name = function() {
this.language = "zh"; return "Big5";
};
this.language = function() {
return "zh";
};
// TODO: This set of data comes from the character frequency- // TODO: This set of data comes from the character frequency-
// of-occurence analysis tool. The data needs to be moved // of-occurence analysis tool. The data needs to be moved
// into a resource and loaded from there. // into a resource and loaded from there.
@ -363,8 +371,12 @@ util.inherits(module.exports.euc, mbcs);
* is created and kept by the public CharsetDetector class * is created and kept by the public CharsetDetector class
*/ */
module.exports.euc_jp = function() { module.exports.euc_jp = function() {
this.name = "EUC-JP"; this.name = function() {
this.language = "ja"; return "EUC-JP";
};
this.language = function() {
return "ja";
};
// TODO: This set of data comes from the character frequency- // TODO: This set of data comes from the character frequency-
// of-occurence analysis tool. The data needs to be moved // of-occurence analysis tool. The data needs to be moved
@ -391,8 +403,12 @@ util.inherits(module.exports.euc_jp, module.exports.euc);
* is created and kept by the public CharsetDetector class * is created and kept by the public CharsetDetector class
*/ */
module.exports.euc_kr = function() { module.exports.euc_kr = function() {
this.name = "EUC-KR"; this.name = function() {
this.language = "ko"; return "EUC-KR";
};
this.language = function() {
return "ko";
};
// TODO: This set of data comes from the character frequency- // TODO: This set of data comes from the character frequency-
// of-occurence analysis tool. The data needs to be moved // of-occurence analysis tool. The data needs to be moved
@ -418,8 +434,12 @@ util.inherits(module.exports.euc_kr, module.exports.euc);
* GB-18030 recognizer. Uses simplified Chinese statistics. * GB-18030 recognizer. Uses simplified Chinese statistics.
*/ */
module.exports.gb_18030 = function() { module.exports.gb_18030 = function() {
this.name = "GB18030"; this.name = function() {
this.language = "zh"; return "GB18030";
};
this.language = function() {
return "zh";
};
/* /*
* (non-Javadoc) * (non-Javadoc)
* Get the next character value for EUC based encodings. * Get the next character value for EUC based encodings.

View File

@ -151,7 +151,7 @@ sbcs.prototype.match = function(det) {
} }
} }
var name = this.getName(det); var name = this.name(det);
return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang); return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang);
} }
@ -260,7 +260,7 @@ module.exports.ISO_8859_1 = function() {
]) ])
]; ];
this.getName = function(det) { this.name = function(det) {
if (typeof det == 'undefined') if (typeof det == 'undefined')
return "ISO-8859-1"; return "ISO-8859-1";
return det.fC1Bytes ? "windows-1252" : "ISO-8859-1"; return det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
@ -332,7 +332,7 @@ module.exports.ISO_8859_2 = function() {
]) ])
]; ];
this.getName = function(det) { this.name = function(det) {
if (typeof det == 'undefined') if (typeof det == 'undefined')
return "ISO-8859-2"; return "ISO-8859-2";
return det.fC1Bytes ? "windows-1250" : "ISO-8859-2"; return det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
@ -384,11 +384,11 @@ module.exports.ISO_8859_5 = function() {
0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520
]; ];
this.getName = function(det) { this.name = function(det) {
return "ISO-8859-5"; return "ISO-8859-5";
}; };
this.getLanguage = function() { this.language = function() {
return "ru"; return "ru";
}; };
}; };
@ -438,11 +438,11 @@ module.exports.ISO_8859_6 = function() {
0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620
]; ];
this.getName = function(det) { this.name = function(det) {
return "ISO-8859-6"; return "ISO-8859-6";
}; };
this.getLanguage = function() { this.language = function() {
return "ar"; return "ar";
}; };
}; };
@ -492,13 +492,13 @@ module.exports.ISO_8859_7 = function() {
0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20
]; ];
this.getName = function(det) { this.name = function(det) {
if (typeof det == 'undefined') if (typeof det == 'undefined')
return "ISO-8859-7"; return "ISO-8859-7";
return det.fC1Bytes ? "windows-1253" : "ISO-8859-7"; return det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
}; };
this.getLanguage = function() { this.language = function() {
return "el"; return "el";
}; };
}; };

View File

@ -7,7 +7,9 @@ var util = require('util'),
*/ */
module.exports.UTF_16BE = function() { module.exports.UTF_16BE = function() {
this.name = "UTF-16BE"; this.name = function() {
return "UTF-16BE";
};
this.match = function(det) { this.match = function(det) {
var input = det.fRawInput; var input = det.fRawInput;
@ -22,7 +24,9 @@ module.exports.UTF_16BE = function() {
}; };
module.exports.UTF_16LE = function() { module.exports.UTF_16LE = function() {
this.name = "UTF-16LE"; this.name = function() {
return "UTF-16LE";
};
this.match = function(det) { this.match = function(det) {
var input = det.fRawInput; var input = det.fRawInput;
@ -88,7 +92,9 @@ UTF_32.prototype.match = function(det) {
}; };
module.exports.UTF_32BE = function() { module.exports.UTF_32BE = function() {
this.name = "UTF_32BE"; this.name = function() {
return "UTF_32BE";
};
this.getChar = function(input, index) { this.getChar = function(input, index) {
return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
(input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
@ -97,7 +103,9 @@ module.exports.UTF_32BE = function() {
util.inherits(module.exports.UTF_32BE, UTF_32); util.inherits(module.exports.UTF_32BE, UTF_32);
module.exports.UTF_32LE = function() { module.exports.UTF_32LE = function() {
this.name = "UTF_32LE"; this.name = function() {
return "UTF_32LE";
};
this.getChar = function(input, index) { this.getChar = function(input, index) {
return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
(input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);

View File

@ -5,7 +5,9 @@ var Match = require ('../match');
* Charset recognizer for UTF-8 * Charset recognizer for UTF-8
*/ */
module.exports = function() { module.exports = function() {
this.name = "UTF-8"; this.name = function() {
return "UTF-8";
};
this.match = function(det) { this.match = function(det) {
var hasBOM = false, var hasBOM = false,

44
index.js Normal file
View File

@ -0,0 +1,44 @@
var UTF_8 = require('./encoding/utf8');
var unicode = require('./encoding/unicode');
var mbcs = require('./encoding/mbcs');
var iso2022 = require('./encoding/iso2022');
// Recogniser instances, one per supported charset. The order of this
// list is the order in which candidates are collected, so it can affect
// which charset wins when two matches report equal confidence.
var recognisers = [
  // Unicode encodings
  new UTF_8(),
  new unicode.UTF_16BE(),
  new unicode.UTF_16LE(),
  new unicode.UTF_32BE(),
  new unicode.UTF_32LE(),

  // Multi-byte East Asian encodings
  new mbcs.sjis(),
  new mbcs.big5(),
  new mbcs.euc_jp(),
  new mbcs.euc_kr(),
  new mbcs.gb_18030(),

  // Escape-sequence based ISO-2022 encodings
  new iso2022.ISO_2022_JP(),
  new iso2022.ISO_2022_KR(),
  new iso2022.ISO_2022_CN()
];
/**
 * Guess the character encoding of a buffer.
 *
 * Every recogniser in `recognisers` is run against the input and the name
 * carried by the match with the highest confidence is returned.
 *
 * @param {Buffer} buffer raw bytes to analyse
 * @returns {*} the `name` of the most confident match, or null when no
 *   recogniser produced a match
 */
module.exports.detect = function(buffer) {
  // Detection context shared by all recognisers; the raw and the
  // "input" views both point at the same, unmodified buffer.
  var det = {
    fRawInput: buffer,
    fRawLength: buffer.length,
    fInputBytes: buffer,
    fInputLen: buffer.length
  };

  var matches = [];
  // Walk the list back to front so the collection order (and therefore
  // the tie-breaking between equal confidences) stays as it was.
  for (var i = recognisers.length - 1; i >= 0; i--) {
    var match = recognisers[i].match(det);
    // A recogniser may report "no match" with a null result; keeping it
    // would make the comparator below dereference null.
    if (match) {
      matches.push(match);
    }
  }

  if (matches.length === 0) {
    return null;
  }

  // Ascending by confidence, so the best candidate ends up last.
  matches.sort(function(a, b) {
    return a.confidence - b.confidence;
  });

  return matches.pop().name;
};

View File

@ -1,6 +1,7 @@
module.exports = function(det, rec, confidence) { module.exports = function(det, rec, confidence, name, lang) {
// console.log(det, rec, confidence); // console.log(det, rec, confidence);
// this.res = 1; // this.res = 1;
this.confidence = confidence; this.confidence = confidence;
this.name = name || rec.getName();
} }

0
test.js Normal file
View File