From dcaa9ba5d7f096814af93802ba7b50d62f8aa651 Mon Sep 17 00:00:00 2001 From: Dmitry Shirokov Date: Mon, 4 Mar 2013 23:47:01 +0400 Subject: [PATCH] main class --- encoding/iso2022.js | 12 +++++++++--- encoding/mbcs.js | 40 ++++++++++++++++++++++++++++++---------- encoding/sbcs.js | 18 +++++++++--------- encoding/unicode.js | 16 ++++++++++++---- encoding/utf8.js | 4 +++- index.js | 44 ++++++++++++++++++++++++++++++++++++++++++++ match.js | 3 ++- test.js | 0 8 files changed, 109 insertions(+), 28 deletions(-) create mode 100644 index.js create mode 100644 test.js diff --git a/encoding/iso2022.js b/encoding/iso2022.js index 610bad2..06ec8ec 100644 --- a/encoding/iso2022.js +++ b/encoding/iso2022.js @@ -98,7 +98,9 @@ ISO_2022.prototype.match = function(det) { }; module.exports.ISO_2022_JP = function() { - this.name = "ISO-2022-JP"; + this.name = function() { + return "ISO-2022-JP"; + }; this.escapeSequences = [ [ 0x1b, 0x24, 0x28, 0x43 ], // KS X 1001:1992 [ 0x1b, 0x24, 0x28, 0x44 ], // JIS X 212-1990 @@ -119,7 +121,9 @@ util.inherits(module.exports.ISO_2022_JP, ISO_2022); module.exports.ISO_2022_KR = function() { - this.name = "ISO-2022-KR"; + this.name = function() { + return "ISO-2022-KR"; + }; this.escapeSequences = [ [ 0x1b, 0x24, 0x29, 0x43 ] ]; @@ -129,7 +133,9 @@ util.inherits(module.exports.ISO_2022_KR, ISO_2022); module.exports.ISO_2022_CN = function() { - this.name = "ISO-2022-CN"; + this.name = function() { + return "ISO-2022-CN"; + }; this.escapeSequences = [ [ 0x1b, 0x24, 0x29, 0x41 ], // GB 2312-80 [ 0x1b, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1 diff --git a/encoding/mbcs.js b/encoding/mbcs.js index fba37c1..335b0d0 100644 --- a/encoding/mbcs.js +++ b/encoding/mbcs.js @@ -193,8 +193,12 @@ mbcs.prototype.nextChar = function(iter, det) {}; * Shift-JIS charset recognizer. */ module.exports.sjis = function() { - this.name = "Shift_JIS"; - this.language = "ja"; + this.name = function() { + return "Shift_JIS"; + }; + this.language = function() { + return "ja"; + }; // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved @@ -243,8 +247,12 @@ util.inherits(module.exports.sjis, mbcs); * Big5 charset recognizer. */ module.exports.big5 = function() { - this.name = "Big5"; - this.language = "zh"; + this.name = function() { + return "Big5"; + }; + this.language = function() { + return "zh"; + }; // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved // into a resource and loaded from there. @@ -363,8 +371,12 @@ util.inherits(module.exports.euc, mbcs); * is created and kept by the public CharsetDetector class */ module.exports.euc_jp = function() { - this.name = "EUC-JP"; - this.language = "ja"; + this.name = function() { + return "EUC-JP"; + }; + this.language = function() { + return "ja"; + }; // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved @@ -391,8 +403,12 @@ util.inherits(module.exports.euc_jp, module.exports.euc); * is created and kept by the public CharsetDetector class */ module.exports.euc_kr = function() { - this.name = "EUC-KR"; - this.language = "ko"; + this.name = function() { + return "EUC-KR"; + }; + this.language = function() { + return "ko"; + }; // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved @@ -418,8 +434,12 @@ util.inherits(module.exports.euc_kr, module.exports.euc); * GB-18030 recognizer. Uses simplified Chinese statistics. */ module.exports.gb_18030 = function() { - this.name = "GB18030"; - this.language = "zh"; + this.name = function() { + return "GB18030"; + }; + this.language = function() { + return "zh"; + }; /* * (non-Javadoc) * Get the next character value for EUC based encodings. diff --git a/encoding/sbcs.js b/encoding/sbcs.js index d136c48..2d0842c 100644 --- a/encoding/sbcs.js +++ b/encoding/sbcs.js @@ -151,7 +151,7 @@ sbcs.prototype.match = function(det) { } } - var name = this.getName(det); + var name = this.name(det); return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang); } @@ -260,7 +260,7 @@ module.exports.ISO_8859_1 = function() { ]) ]; - this.getName = function(det) { + this.name = function(det) { if (typeof det == 'undefined') return "ISO-8859-1"; return det.fC1Bytes ? "windows-1252" : "ISO-8859-1"; @@ -332,7 +332,7 @@ module.exports.ISO_8859_2 = function() { ]) ]; - this.getName = function(det) { + this.name = function(det) { if (typeof det == 'undefined') return "ISO-8859-2"; return det.fC1Bytes ? "windows-1250" : "ISO-8859-2"; @@ -384,11 +384,11 @@ module.exports.ISO_8859_5 = function() { 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520 ]; - this.getName = function(det) { + this.name = function(det) { return "ISO-8859-5"; }; - this.getLanguage = function() { + this.language = function() { return "ru"; }; }; @@ -438,11 +438,11 @@ module.exports.ISO_8859_6 = function() { 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620 ]; - this.getName = function(det) { + this.name = function(det) { return "ISO-8859-6"; }; - this.getLanguage = function() { + this.language = function() { return "ar"; }; }; @@ -492,13 +492,13 @@ module.exports.ISO_8859_7 = function() { 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20 ]; - this.getName = function(det) { + this.name = function(det) { if (typeof det == 'undefined') return "ISO-8859-7"; return det.fC1Bytes ? "windows-1253" : "ISO-8859-7"; }; - this.getLanguage = function() { + this.language = function() { return "el"; }; }; diff --git a/encoding/unicode.js b/encoding/unicode.js index 9298352..b9d193a 100644 --- a/encoding/unicode.js +++ b/encoding/unicode.js @@ -7,7 +7,9 @@ var util = require('util'), */ module.exports.UTF_16BE = function() { - this.name = "UTF-16BE"; + this.name = function() { + return "UTF-16BE"; + }; this.match = function(det) { var input = det.fRawInput; @@ -22,7 +24,9 @@ module.exports.UTF_16BE = function() { }; module.exports.UTF_16LE = function() { - this.name = "UTF-16LE"; + this.name = function() { + return "UTF-16LE"; + }; this.match = function(det) { var input = det.fRawInput; @@ -88,7 +92,9 @@ UTF_32.prototype.match = function(det) { }; module.exports.UTF_32BE = function() { - this.name = "UTF_32BE"; + this.name = function() { + return "UTF_32BE"; + }; this.getChar = function(input, index) { return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); @@ -97,7 +103,9 @@ module.exports.UTF_32BE = function() { util.inherits(module.exports.UTF_32BE, UTF_32); module.exports.UTF_32LE = function() { - this.name = "UTF_32LE"; + this.name = function() { + return "UTF_32LE"; + }; this.getChar = function(input, index) { return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); diff --git a/encoding/utf8.js b/encoding/utf8.js index c0cd442..bcd0e29 100644 --- a/encoding/utf8.js +++ b/encoding/utf8.js @@ -5,7 +5,9 @@ var Match = require ('../match'); * Charset recognizer for UTF-8 */ module.exports = function() { - this.name = "UTF-8"; + this.name = function() { + return "UTF-8"; + }; this.match = function(det) { var hasBOM = false, diff --git a/index.js b/index.js new file mode 100644 index 0000000..916487b --- /dev/null +++ b/index.js @@ -0,0 +1,44 @@ + + +var UTF_8 = require('./encoding/utf8'); +var unicode = require('./encoding/unicode'); +var mbcs = require('./encoding/mbcs'); +var iso2022 = require('./encoding/iso2022'); + +var recognisers = [ + new UTF_8, + new unicode.UTF_16BE, + new unicode.UTF_16LE, + new unicode.UTF_32BE, + new unicode.UTF_32LE, + new mbcs.sjis, + new mbcs.big5, + new mbcs.euc_jp, + new mbcs.euc_kr, + new mbcs.gb_18030, + new iso2022.ISO_2022_JP, + new iso2022.ISO_2022_KR, + new iso2022.ISO_2022_CN +]; + +module.exports.detect = function(buffer) { + + var det = { + fRawInput: buffer, + fRawLength: buffer.length, + fInputBytes: buffer, + fInputLen: buffer.length + }; + + var matches = []; + for (var i = recognisers.length - 1; i >= 0; i--) { + var recogniser = recognisers[i]; + matches.push(rec.match(det)); + }; + + matches.sort(function(a, b) { + return a.confidence - b.confidence; + }); + + return matches.pop().name; +}; \ No newline at end of file diff --git a/match.js b/match.js index b8ec5a6..cc58223 100644 --- a/match.js +++ b/match.js @@ -1,6 +1,7 @@ -module.exports = function(det, rec, confidence) { +module.exports = function(det, rec, confidence, name, lang) { // console.log(det, rec, confidence); // this.res = 1; this.confidence = confidence; + this.name = name || rec.getName(); } \ No newline at end of file diff --git a/test.js b/test.js new file mode 100644 index 0000000..e69de29