main class

2013-03-04 23:47:01 +04:00 · 2013-03-04 23:47:01 +04:00 · dcaa9ba5d7
parent a93a9cd557
commit dcaa9ba5d7
8 changed files with 109 additions and 28 deletions
--- a/encoding/iso2022.js
+++ b/encoding/iso2022.js
@ -98,7 +98,9 @@ ISO_2022.prototype.match = function(det) {
 };

 module.exports.ISO_2022_JP = function() {
-    this.name = "ISO-2022-JP";
+    this.name = function() {
+        return "ISO-2022-JP";
+    };
    this.escapeSequences = [
        [ 0x1b, 0x24, 0x28, 0x43 ],   // KS X 1001:1992
        [ 0x1b, 0x24, 0x28, 0x44 ],   // JIS X 212-1990
@ -119,7 +121,9 @@ util.inherits(module.exports.ISO_2022_JP, ISO_2022);


 module.exports.ISO_2022_KR = function() {
-    this.name = "ISO-2022-KR";
+    this.name = function() {
+        return "ISO-2022-KR";
+    };
    this.escapeSequences = [
        [ 0x1b, 0x24, 0x29, 0x43 ]
    ];
@ -129,7 +133,9 @@ util.inherits(module.exports.ISO_2022_KR, ISO_2022);


 module.exports.ISO_2022_CN = function() {
-    this.name = "ISO-2022-CN";
+    this.name = function() {
+        return "ISO-2022-CN";
+    };
    this.escapeSequences = [
        [ 0x1b, 0x24, 0x29, 0x41 ],   // GB 2312-80
        [ 0x1b, 0x24, 0x29, 0x47 ],   // CNS 11643-1992 Plane 1
--- a/encoding/mbcs.js
+++ b/encoding/mbcs.js
@ -193,8 +193,12 @@ mbcs.prototype.nextChar = function(iter, det) {};
 * Shift-JIS charset recognizer.
 */
 module.exports.sjis = function() {
-    this.name = "Shift_JIS";
-    this.language = "ja";
+    this.name = function() {
+        return "Shift_JIS";
+    };
+    this.language = function() {
+        return "ja";
+    };

    // TODO:  This set of data comes from the character frequency-
    //        of-occurence analysis tool.  The data needs to be moved
@ -243,8 +247,12 @@ util.inherits(module.exports.sjis, mbcs);
 *   Big5 charset recognizer.
 */
 module.exports.big5 = function() {
-    this.name = "Big5";
-    this.language = "zh";
+    this.name = function() {
+        return "Big5";
+    };
+    this.language = function() {
+        return "zh";
+    };
    // TODO:  This set of data comes from the character frequency-
    //        of-occurence analysis tool.  The data needs to be moved
    //        into a resource and loaded from there.
@ -363,8 +371,12 @@ util.inherits(module.exports.euc, mbcs);
 *    is created and kept by the public CharsetDetector class
 */
 module.exports.euc_jp = function() {
-    this.name = "EUC-JP";
-    this.language = "ja";
+    this.name = function() {
+        return "EUC-JP";
+    };
+    this.language = function() {
+        return "ja";
+    };

    // TODO:  This set of data comes from the character frequency-
    //        of-occurence analysis tool.  The data needs to be moved
@ -391,8 +403,12 @@ util.inherits(module.exports.euc_jp, module.exports.euc);
 *    is created and kept by the public CharsetDetector class
 */
 module.exports.euc_kr = function() {
-    this.name = "EUC-KR";
-    this.language = "ko";
+    this.name = function() {
+        return "EUC-KR";
+    };
+    this.language = function() {
+        return "ko";
+    };

    // TODO:  This set of data comes from the character frequency-
    //        of-occurence analysis tool.  The data needs to be moved
@ -418,8 +434,12 @@ util.inherits(module.exports.euc_kr, module.exports.euc);
 *   GB-18030 recognizer. Uses simplified Chinese statistics.
 */
 module.exports.gb_18030 = function() {
-    this.name = "GB18030";
-    this.language = "zh";
+    this.name = function() {
+        return "GB18030";
+    };
+    this.language = function() {
+        return "zh";
+    };
    /*
     *  (non-Javadoc)
     *  Get the next character value for EUC based encodings.
--- a/encoding/sbcs.js
+++ b/encoding/sbcs.js
@ -151,7 +151,7 @@ sbcs.prototype.match = function(det) {
            }
        }

-        var name = this.getName(det);
+        var name = this.name(det);
        return bestConfidenceSoFar <= 0 ? null : new Match(det, this, bestConfidenceSoFar, name, lang);
    }

@ -260,7 +260,7 @@ module.exports.ISO_8859_1 = function() {
        ])
    ];

-    this.getName = function(det) {
+    this.name = function(det) {
        if (typeof det == 'undefined')
            return "ISO-8859-1";
        return det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
@ -332,7 +332,7 @@ module.exports.ISO_8859_2 = function() {
        ])
    ];

-    this.getName = function(det) {
+    this.name = function(det) {
        if (typeof det == 'undefined')
            return "ISO-8859-2";
        return det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
@ -384,11 +384,11 @@ module.exports.ISO_8859_5 = function() {
        0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520
    ];

-    this.getName = function(det) {
+    this.name = function(det) {
        return "ISO-8859-5";
    };

-    this.getLanguage = function() {
+    this.language = function() {
        return "ru";
    };
 };
@ -438,11 +438,11 @@ module.exports.ISO_8859_6 = function() {
        0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620
    ];

-    this.getName = function(det) {
+    this.name = function(det) {
        return "ISO-8859-6";
    };

-    this.getLanguage = function() {
+    this.language = function() {
        return "ar";
    };
 };
@ -492,13 +492,13 @@ module.exports.ISO_8859_7 = function() {
        0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20
    ];

-    this.getName = function(det) {
+    this.name = function(det) {
        if (typeof det == 'undefined')
            return "ISO-8859-7";
        return det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
    };

-    this.getLanguage = function() {
+    this.language = function() {
        return "el";
    };
 };
--- a/encoding/unicode.js
+++ b/encoding/unicode.js
@ -7,7 +7,9 @@ var util = require('util'),
 */

 module.exports.UTF_16BE = function() {
-    this.name = "UTF-16BE";
+    this.name = function() {
+        return "UTF-16BE";
+    };
    this.match = function(det) {
        var input = det.fRawInput;

@ -22,7 +24,9 @@ module.exports.UTF_16BE = function() {
 };

 module.exports.UTF_16LE = function() {
-    this.name = "UTF-16LE";
+    this.name = function() {
+        return "UTF-16LE";
+    };
    this.match = function(det) {
        var input = det.fRawInput;

@ -88,7 +92,9 @@ UTF_32.prototype.match = function(det) {
 };

 module.exports.UTF_32BE = function() {
-    this.name = "UTF_32BE";
+    this.name = function() {
+        return "UTF_32BE";
+    };
    this.getChar = function(input, index) {
        return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
               (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
@ -97,7 +103,9 @@ module.exports.UTF_32BE = function() {
 util.inherits(module.exports.UTF_32BE, UTF_32);

 module.exports.UTF_32LE = function() {
-    this.name = "UTF_32LE";
+    this.name = function() {
+        return "UTF_32LE";
+    };
    this.getChar = function(input, index) {
        return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
               (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
--- a/encoding/utf8.js
+++ b/encoding/utf8.js
@ -5,7 +5,9 @@ var Match = require ('../match');
 * Charset recognizer for UTF-8
 */
 module.exports = function() {
-    this.name = "UTF-8";
+    this.name = function() {
+        return "UTF-8";
+    };
    this.match = function(det) {

        var hasBOM = false,
--- a/index.js
+++ b/index.js
@ -0,0 +1,44 @@
+
+
+var UTF_8 = require('./encoding/utf8');
+var unicode = require('./encoding/unicode');
+var mbcs = require('./encoding/mbcs');
+var iso2022 = require('./encoding/iso2022');
+
+var recognisers = [   // one shared instance of each recogniser consulted by detect()
+    new UTF_8(),
+    new unicode.UTF_16BE(),
+    new unicode.UTF_16LE(),
+    new unicode.UTF_32BE(),
+    new unicode.UTF_32LE(),
+    new mbcs.sjis(),
+    new mbcs.big5(),
+    new mbcs.euc_jp(),
+    new mbcs.euc_kr(),
+    new mbcs.gb_18030(),
+    new iso2022.ISO_2022_JP(),
+    new iso2022.ISO_2022_KR(),
+    new iso2022.ISO_2022_CN()
+];
+
+module.exports.detect = function(buffer) {
+    // Returns the name of the highest-confidence match for buffer, or null if none matched.
+    var det = {
+        fRawInput:   buffer,
+        fRawLength:  buffer.length,
+        fInputBytes: buffer,
+        fInputLen:   buffer.length
+    };
+
+    var matches = [];
+    for (var i = recognisers.length - 1; i >= 0; i--) {
+        var match = recognisers[i].match(det);
+        if (match) matches.push(match);   // a recogniser may report no match (null)
+    }
+
+    matches.sort(function(a, b) {
+        return a.confidence - b.confidence;
+    });
+    // Ascending sort puts the best match last.
+    return matches.length ? matches.pop().name : null;
+};
--- a/match.js
+++ b/match.js
@ -1,6 +1,7 @@

-module.exports = function(det, rec, confidence) {
+module.exports = function(det, rec, confidence, name, lang) {
    // console.log(det, rec, confidence);
    // this.res = 1;
    this.confidence = confidence;
+    this.name       = name || rec.name(det);   // recognisers expose name(det); getName() was renamed in this commit
 }
--- a/test.js
+++ b/test.js