Merge pull request #6 from runk/optimize-file-read

Optimize file read
This commit is contained in:
Dmitry Shirokov 2017-10-16 13:44:12 +11:00 committed by GitHub
commit 320e3899af
3 changed files with 59 additions and 6 deletions

View File

@ -23,6 +23,15 @@ chardet.detectFile('/path/to/file', function(err, encoding) {});
chardet.detectFileSync('/path/to/file'); chardet.detectFileSync('/path/to/file');
``` ```
## Working with large data sets
Sometimes, when the data set is huge and you want to optimize performance (at the cost of some accuracy),
you can sample only the first N bytes of the buffer:
```javascript
chardet.detectFile('/path/to/file', { sampleSize: 32 }, function(err, encoding) {});
```
## Supported Encodings: ## Supported Encodings:
* UTF-8 * UTF-8

View File

@ -73,13 +73,45 @@ module.exports.detect = function(buffer) {
return match ? match.name : null; return match ? match.name : null;
}; };
module.exports.detectFile = function(filepath, fn) { module.exports.detectFile = function(filepath, opts, cb) {
fs.readFile(filepath, function(err, res) { if (typeof opts === 'function') {
if (err) return fn(err, null); cb = opts;
fn(null, self.detect(res)); opts = undefined;
}); }
var fd;
var handler = function(err, buffer) {
if (fd) {
fs.closeSync(fd);
}
if (err) return cb(err, null);
cb(null, self.detect(buffer));
};
if (opts && opts.sampleSize) {
fd = fs.openSync(filepath, 'r'),
sample = new Buffer(opts.sampleSize);
fs.read(fd, sample, 0, opts.sampleSize, null, function(err) {
handler(err, sample);
});
return;
}
fs.readFile(filepath, handler);
}; };
module.exports.detectFileSync = function(filepath) { module.exports.detectFileSync = function(filepath, opts) {
if (opts && opts.sampleSize) {
var fd = fs.openSync(filepath, 'r'),
sample = new Buffer(opts.sampleSize);
fs.readSync(fd, sample, 0, opts.sampleSize);
fs.closeSync(fd);
return self.detect(sample);
}
return self.detect(fs.readFileSync(filepath)); return self.detect(fs.readFileSync(filepath));
}; };

View File

@ -20,11 +20,23 @@ describe('chardet', function() {
done(); done();
}); });
}); });
it('should detect encoding with smaller sample size', function(done) {
chardet.detectFile(path, { sampleSize: 32 }, function(err, res) {
assert.equal(err, null);
assert.equal(res, 'UTF-8');
done();
});
});
}); });
describe('#detectFileSync', function() { describe('#detectFileSync', function() {
it('should detect encoding', function() { it('should detect encoding', function() {
assert.equal(chardet.detectFileSync(path), 'UTF-8'); assert.equal(chardet.detectFileSync(path), 'UTF-8');
}); });
it('should detect encoding with smaller sample size', function() {
assert.equal(chardet.detectFileSync(path, { sampleSize: 32 }), 'UTF-8');
});
}); });
}); });