Add option to return all matches

2018-04-22 00:57:54 -07:00 · 2018-04-22 00:57:54 -07:00 · 0c775fa651
parent 049fe54fa1
commit 0c775fa651
3 changed files with 72 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -32,6 +32,16 @@ you can sample only first N bytes of the buffer:
 chardet.detectFile('/path/to/file', { sampleSize: 32 }, function(err, encoding) {});
 ```

+## Returning more detailed results
+
+If you wish to see the full list of possible encodings:
+```javascript
+chardet.detectFile('/path/to/file', { returnAllMatches: true }, function(err, encodings) {
+  //encodings is an array of objects sorted by confidence value in decending order
+  //e.g. [{ confidence: 90, name: 'UTF-8'}, {confidence: 20, name: 'windows-1252', lang: 'fr'}]
+});
+```
+
 ## Supported Encodings:

 * UTF-8
--- a/index.js
+++ b/index.js
@ -35,7 +35,7 @@ var recognisers = [
  new sbcs.KOI8_R
 ];

-module.exports.detect = function(buffer) {
+module.exports.detect = function(buffer, opts) {

  // Tally up the byte occurence statistics.
  var fByteStats = [];
@ -62,15 +62,20 @@ module.exports.detect = function(buffer) {
    fInputLen:   buffer.length
  };

-  var match = recognisers.map(function(rec) {
+  var matches = recognisers.map(function(rec) {
    return rec.match(context);
  }).filter(function(match) {
    return !!match;
  }).sort(function(a, b) {
-    return a.confidence - b.confidence;
-  }).pop();
+    return b.confidence - a.confidence;
+  });

-  return match ? match.name : null;
+  if (opts && opts.returnAllMatches === true) {
+    return matches;
+  }
+  else {
+    return matches.length > 0 ? matches[0].name : null;
+  }
 };

 module.exports.detectFile = function(filepath, opts, cb) {
@ -87,7 +92,7 @@ module.exports.detectFile = function(filepath, opts, cb) {
    }

    if (err) return cb(err, null);
-    cb(null, self.detect(buffer));
+    cb(null, self.detect(buffer, opts));
  };

  if (opts && opts.sampleSize) {
@ -110,8 +115,8 @@ module.exports.detectFileSync = function(filepath, opts) {

    fs.readSync(fd, sample, 0, opts.sampleSize);
    fs.closeSync(fd);
-    return self.detect(sample);
+    return self.detect(sample, opts);
  }

-  return self.detect(fs.readFileSync(filepath));
+  return self.detect(fs.readFileSync(filepath), opts);
 };
--- a/test/chardet.js
+++ b/test/chardet.js
@ -5,11 +5,27 @@ var assert = require('assert'),
 describe('chardet', function() {

  var path = __dirname + '/data/encodings/utf8';
+  var expectedEncodingsFromPath = JSON.stringify([
+    { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
+    { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
+    { 'confidence': 19, 'name': 'KOI8-R', 'lang': undefined },
+    { 'confidence': 10, 'name': 'Big5', 'lang': undefined },
+    { 'confidence': 10, 'name': 'GB18030', 'lang': undefined },
+    { 'confidence': 10, 'name': 'windows-1253', 'lang': undefined },
+    { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' },
+    { 'confidence': 4, 'name': 'windows-1254', 'lang': undefined },
+    { 'confidence': 2, 'name': 'windows-1251', 'lang': undefined }
+  ]);

  describe('#detect', function() {
    it('should detect encoding', function() {
      assert.equal(chardet.detect(fs.readFileSync(path)), 'UTF-8');
    });
+
+    it('should return a list of encodings, sorted by confidence level in decending order', function() {
+      var matches = chardet.detect(fs.readFileSync(path), { returnAllMatches: true });
+      assert.equal(JSON.stringify(matches), expectedEncodingsFromPath);
+    });
  });

  describe('#detectFile', function() {
@ -28,6 +44,22 @@ describe('chardet', function() {
        done();
      });
    });
+
+    it('should return a list of encodings, sorted by confidence level in decending order', function() {
+      chardet.detectFile(path, { returnAllMatches: true }, function(err, res) {
+        assert.equal(err, null);
+        assert.equal(JSON.stringify(res), expectedEncodingsFromPath);
+        done();
+      });
+    });
+
+    it('should return a list of encodings even with smaller sample size, sorted by confidence level in decending order', function() {
+      chardet.detectFile(path, { sampleSize: 32, returnAllMatches: true }, function(err, res) {
+        assert.equal(err, null);
+        assert.equal(JSON.stringify(res), expectedEncodingsFromPath);
+        done();
+      });
+    });
  });

  describe('#detectFileSync', function() {
@ -38,5 +70,22 @@ describe('chardet', function() {
    it('should detect encoding with smaller sample size', function() {
      assert.equal(chardet.detectFileSync(path, { sampleSize: 32 }), 'UTF-8');
    });
+
+    it('should return a list of encodings, sorted by confidence level in decending order', function() {
+      var matches = chardet.detectFileSync(path, { returnAllMatches: true });
+      assert.equal(JSON.stringify(matches), expectedEncodingsFromPath);
+    });
+
+    it('should return a list of encodings even with smaller sample size, sorted by confidence level in decending order', function() {
+      var matches = chardet.detectFileSync(path, { sampleSize: 32, returnAllMatches: true });
+      assert.equal(JSON.stringify(matches), JSON.stringify([
+        {'confidence': 100, 'name': 'UTF-8', 'lang': undefined},
+        {'confidence': 10, 'name': 'Shift-JIS', 'lang': undefined},
+        {'confidence': 10, 'name': 'windows-1252', 'lang': 'it'},
+        {'confidence': 10, 'name': 'windows-1250', 'lang': 'hu'},
+        {'confidence': 10, 'name': 'windows-1253', 'lang': undefined},
+        {'confidence': 10, 'name': 'windows-1251', 'lang': undefined}
+      ]));
+    });
  });
 });