From 0c775fa651910da436c72a25361411bc2751724b Mon Sep 17 00:00:00 2001 From: zevanty <95801+zevanty@users.noreply.github.com> Date: Sun, 22 Apr 2018 00:57:54 -0700 Subject: [PATCH] Add option to return all matches --- README.md | 10 ++++++++++ index.js | 21 +++++++++++++-------- test/chardet.js | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7ca84e3..a975c84 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,16 @@ you can sample only first N bytes of the buffer: chardet.detectFile('/path/to/file', { sampleSize: 32 }, function(err, encoding) {}); ``` +## Returning more detailed results + +If you wish to see the full list of possible encodings: +```javascript +chardet.detectFile('/path/to/file', { returnAllMatches: true }, function(err, encodings) { + //encodings is an array of objects sorted by confidence value in descending order + //e.g. [{ confidence: 90, name: 'UTF-8'}, {confidence: 20, name: 'windows-1252', lang: 'fr'}] +}); +``` + ## Supported Encodings: * UTF-8 diff --git a/index.js b/index.js index e5bcedd..f75b4ba 100644 --- a/index.js +++ b/index.js @@ -35,7 +35,7 @@ var recognisers = [ new sbcs.KOI8_R ]; -module.exports.detect = function(buffer) { +module.exports.detect = function(buffer, opts) { // Tally up the byte occurence statistics. var fByteStats = []; @@ -62,15 +62,20 @@ module.exports.detect = function(buffer) { fInputLen: buffer.length }; - var match = recognisers.map(function(rec) { + var matches = recognisers.map(function(rec) { return rec.match(context); }).filter(function(match) { return !!match; }).sort(function(a, b) { - return a.confidence - b.confidence; - }).pop(); + return b.confidence - a.confidence; + }); - return match ? match.name : null; + if (opts && opts.returnAllMatches === true) { + return matches; + } + else { + return matches.length > 0 ? 
matches[0].name : null; + } }; module.exports.detectFile = function(filepath, opts, cb) { @@ -87,7 +92,7 @@ module.exports.detectFile = function(filepath, opts, cb) { } if (err) return cb(err, null); - cb(null, self.detect(buffer)); + cb(null, self.detect(buffer, opts)); }; if (opts && opts.sampleSize) { @@ -110,8 +115,8 @@ module.exports.detectFileSync = function(filepath, opts) { fs.readSync(fd, sample, 0, opts.sampleSize); fs.closeSync(fd); - return self.detect(sample); + return self.detect(sample, opts); } - return self.detect(fs.readFileSync(filepath)); + return self.detect(fs.readFileSync(filepath), opts); }; diff --git a/test/chardet.js b/test/chardet.js index 4b252b0..f7f3db4 100644 --- a/test/chardet.js +++ b/test/chardet.js @@ -5,11 +5,27 @@ var assert = require('assert'), describe('chardet', function() { var path = __dirname + '/data/encodings/utf8'; + var expectedEncodingsFromPath = JSON.stringify([ + { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined }, + { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' }, + { 'confidence': 19, 'name': 'KOI8-R', 'lang': undefined }, + { 'confidence': 10, 'name': 'Big5', 'lang': undefined }, + { 'confidence': 10, 'name': 'GB18030', 'lang': undefined }, + { 'confidence': 10, 'name': 'windows-1253', 'lang': undefined }, + { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' }, + { 'confidence': 4, 'name': 'windows-1254', 'lang': undefined }, + { 'confidence': 2, 'name': 'windows-1251', 'lang': undefined } + ]); describe('#detect', function() { it('should detect encoding', function() { assert.equal(chardet.detect(fs.readFileSync(path)), 'UTF-8'); }); + + it('should return a list of encodings, sorted by confidence level in decending order', function() { + var matches = chardet.detect(fs.readFileSync(path), { returnAllMatches: true }); + assert.equal(JSON.stringify(matches), expectedEncodingsFromPath); + }); }); describe('#detectFile', function() { @@ -28,6 +44,22 @@ describe('chardet', function() { done(); 
}); }); + + it('should return a list of encodings, sorted by confidence level in decending order', function() { + chardet.detectFile(path, { returnAllMatches: true }, function(err, res) { + assert.equal(err, null); + assert.equal(JSON.stringify(res), expectedEncodingsFromPath); + done(); + }); + }); + + it('should return a list of encodings even with smaller sample size, sorted by confidence level in decending order', function() { + chardet.detectFile(path, { sampleSize: 32, returnAllMatches: true }, function(err, res) { + assert.equal(err, null); + assert.equal(JSON.stringify(res), expectedEncodingsFromPath); + done(); + }); + }); }); describe('#detectFileSync', function() { @@ -38,5 +70,22 @@ describe('chardet', function() { it('should detect encoding with smaller sample size', function() { assert.equal(chardet.detectFileSync(path, { sampleSize: 32 }), 'UTF-8'); }); + + it('should return a list of encodings, sorted by confidence level in decending order', function() { + var matches = chardet.detectFileSync(path, { returnAllMatches: true }); + assert.equal(JSON.stringify(matches), expectedEncodingsFromPath); + }); + + it('should return a list of encodings even with smaller sample size, sorted by confidence level in decending order', function() { + var matches = chardet.detectFileSync(path, { sampleSize: 32, returnAllMatches: true }); + assert.equal(JSON.stringify(matches), JSON.stringify([ + {'confidence': 100, 'name': 'UTF-8', 'lang': undefined}, + {'confidence': 10, 'name': 'Shift-JIS', 'lang': undefined}, + {'confidence': 10, 'name': 'windows-1252', 'lang': 'it'}, + {'confidence': 10, 'name': 'windows-1250', 'lang': 'hu'}, + {'confidence': 10, 'name': 'windows-1253', 'lang': undefined}, + {'confidence': 10, 'name': 'windows-1251', 'lang': undefined} + ])); + }); }); });