chardet/index.js

152 lines
3.3 KiB
JavaScript
Raw Permalink Normal View History

2013-03-04 19:47:01 +00:00
2013-04-29 14:31:48 +00:00
var fs = require('fs');
2013-03-04 19:47:01 +00:00
var utf8 = require('./encoding/utf8'),
unicode = require('./encoding/unicode'),
mbcs = require('./encoding/mbcs'),
sbcs = require('./encoding/sbcs'),
iso2022 = require('./encoding/iso2022');
2013-03-04 19:47:01 +00:00
2013-04-29 14:31:48 +00:00
// At the top level of a CommonJS module `this` refers to `module.exports`.
// The alias lets the exported functions below call one another
// (e.g. `self.detect(...)`) from inside nested callbacks.
var self = this;
2013-03-04 19:47:01 +00:00
// One instance of every available recogniser. `detect` calls `match(context)`
// on each of them in the order listed here.
var recognisers = [
  // ./encoding/utf8
  new utf8(),

  // ./encoding/unicode
  new unicode.UTF_16BE(),
  new unicode.UTF_16LE(),
  new unicode.UTF_32BE(),
  new unicode.UTF_32LE(),

  // ./encoding/mbcs
  new mbcs.sjis(),
  new mbcs.big5(),
  new mbcs.euc_jp(),
  new mbcs.euc_kr(),
  new mbcs.gb_18030(),

  // ./encoding/iso2022
  new iso2022.ISO_2022_JP(),
  new iso2022.ISO_2022_KR(),
  new iso2022.ISO_2022_CN(),

  // ./encoding/sbcs
  new sbcs.ISO_8859_1(),
  new sbcs.ISO_8859_2(),
  new sbcs.ISO_8859_5(),
  new sbcs.ISO_8859_6(),
  new sbcs.ISO_8859_7(),
  new sbcs.ISO_8859_8(),
  new sbcs.ISO_8859_9(),
  new sbcs.windows_1251(),
  new sbcs.windows_1256(),
  new sbcs.KOI8_R()
];
2018-04-22 07:57:54 +00:00
/**
 * Run every recogniser over `buffer` and report the best-scoring encoding.
 *
 * @param {Buffer} buffer - Bytes to analyse; read by index and `length`.
 * @param {Object} [opts] - Optional settings.
 * @param {boolean} [opts.returnAllMatches] - When strictly `true`, return the
 *   full list of matches instead of only the best match's name.
 * @returns {(string|null|Array)} The `name` of the match with the highest
 *   `confidence`, `null` when no recogniser matched, or (with
 *   `returnAllMatches`) all matches sorted by descending `confidence`.
 */
module.exports.detect = function(buffer, opts) {
  var total = buffer.length;
  var code, pos;

  // Byte occurrence statistics: slot `b` counts how often byte value `b`
  // appears in the input.
  var histogram = [];
  for (code = 0; code < 256; code++) {
    histogram.push(0);
  }
  for (pos = 0; pos < total; pos++) {
    histogram[buffer[pos] & 0xff] += 1;
  }

  // Flag whether any byte in the range 0x80-0x9F occurs at all.
  var sawC1 = false;
  for (code = 0x80; code <= 0x9F && !sawC1; code++) {
    sawC1 = histogram[code] !== 0;
  }

  // Shared input description handed to every recogniser.
  var context = {
    fByteStats: histogram,
    fC1Bytes: sawC1,
    fRawInput: buffer,
    fRawLength: total,
    fInputBytes: buffer,
    fInputLen: total
  };

  // Keep only the recognisers that produced a (truthy) match.
  var matches = [];
  recognisers.forEach(function(recogniser) {
    var result = recogniser.match(context);
    if (result) {
      matches.push(result);
    }
  });

  // Highest confidence first.
  matches.sort(function(left, right) {
    return right.confidence - left.confidence;
  });

  if (opts && opts.returnAllMatches === true) {
    return matches;
  }
  if (matches.length === 0) {
    return null;
  }
  return matches[0].name;
};
2017-10-16 00:42:49 +00:00
/**
 * Asynchronously detect the encoding of a file.
 *
 * Without `opts.sampleSize` the whole file is read with `fs.readFile`. With
 * `opts.sampleSize` the file is opened and at most that many bytes are read
 * from the freshly opened descriptor; only the bytes actually read are
 * analysed.
 *
 * Note: in sampling mode the file is opened with `fs.openSync`, so a failure
 * to open throws synchronously instead of being passed to `cb`.
 *
 * @param {string} filepath - Path of the file to inspect.
 * @param {Object|Function} [opts] - Options forwarded to `detect`, or the
 *   callback when no options are given.
 * @param {number} [opts.sampleSize] - Maximum number of bytes to read.
 * @param {Function} cb - Called as `cb(err, null)` on a read error, otherwise
 *   as `cb(null, result)` with the value returned by `detect`.
 */
module.exports.detectFile = function(filepath, opts, cb) {
  if (typeof opts === 'function') {
    cb = opts;
    opts = undefined;
  }

  var fd;
  var handler = function(err, buffer) {
    // Compare against undefined rather than truthiness: descriptor 0 is a
    // valid descriptor and must be closed too.
    if (fd !== undefined) {
      fs.closeSync(fd);
    }
    if (err) return cb(err, null);
    cb(null, self.detect(buffer, opts));
  };

  if (opts && opts.sampleSize) {
    fd = fs.openSync(filepath, 'r');
    // Must be a local: as an implicit global it would be shared (and
    // overwritten) by concurrent calls, and would throw in strict mode.
    var sample = Buffer.allocUnsafe(opts.sampleSize);

    fs.read(fd, sample, 0, opts.sampleSize, null, function(err, bytesRead) {
      // allocUnsafe memory is uninitialised; when the file is shorter than
      // sampleSize only the first `bytesRead` bytes hold file data.
      handler(err, err ? sample : sample.subarray(0, bytesRead));
    });
    return;
  }

  fs.readFile(filepath, handler);
};
2017-10-16 00:42:49 +00:00
/**
 * Synchronously detect the encoding of a file.
 *
 * Without `opts.sampleSize` the whole file is read with `fs.readFileSync`.
 * With `opts.sampleSize` at most that many bytes are read and only the bytes
 * actually read are analysed.
 *
 * @param {string} filepath - Path of the file to inspect.
 * @param {Object} [opts] - Options forwarded to `detect`.
 * @param {number} [opts.sampleSize] - Maximum number of bytes to read.
 * @returns {(string|null|Array)} Whatever `detect` returns for the bytes read.
 * @throws Any error raised by the underlying `fs` calls.
 */
module.exports.detectFileSync = function(filepath, opts) {
  if (opts && opts.sampleSize) {
    var fd = fs.openSync(filepath, 'r');
    var sample = Buffer.allocUnsafe(opts.sampleSize);
    var bytesRead;

    try {
      bytesRead = fs.readSync(fd, sample, 0, opts.sampleSize);
    } finally {
      // Release the descriptor even when the read throws.
      fs.closeSync(fd);
    }

    // allocUnsafe memory is uninitialised; when the file is shorter than
    // sampleSize only the first `bytesRead` bytes hold file data.
    return self.detect(sample.subarray(0, bytesRead), opts);
  }

  return self.detect(fs.readFileSync(filepath), opts);
};
// Wrappers for the previous functions to return all encodings
/**
 * Like `detect`, but always returns the full list of matches.
 *
 * @param {Buffer} buffer - Bytes to analyse.
 * @param {Object} [opts] - Options forwarded to `detect`; any non-object
 *   value (and `null`) is treated as "no options".
 * @returns {Array} All matches as returned by `detect` with
 *   `returnAllMatches` enabled.
 */
module.exports.detectAll = function(buffer, opts) {
  // Work on a copy so the caller's options object is not modified, and guard
  // against null explicitly because `typeof null` is 'object'.
  var allOpts = {};
  if (opts !== null && typeof opts === 'object') {
    Object.keys(opts).forEach(function(key) {
      allOpts[key] = opts[key];
    });
  }
  allOpts.returnAllMatches = true;
  return self.detect(buffer, allOpts);
};
/**
 * Like `detectFile`, but the callback always receives the full list of
 * matches.
 *
 * @param {string} filepath - Path of the file to inspect.
 * @param {Object|Function} [opts] - Options forwarded to `detectFile`, or the
 *   callback when no options are given. Any other non-object value (and
 *   `null`) is treated as "no options".
 * @param {Function} cb - Callback passed through to `detectFile`.
 */
module.exports.detectFileAll = function(filepath, opts, cb) {
  if (typeof opts === 'function') {
    cb = opts;
    opts = undefined;
  }

  // Work on a copy so the caller's options object is not modified, and guard
  // against null explicitly because `typeof null` is 'object'.
  var allOpts = {};
  if (opts !== null && typeof opts === 'object') {
    Object.keys(opts).forEach(function(key) {
      allOpts[key] = opts[key];
    });
  }
  allOpts.returnAllMatches = true;
  self.detectFile(filepath, allOpts, cb);
};
/**
 * Like `detectFileSync`, but always returns the full list of matches.
 *
 * @param {string} filepath - Path of the file to inspect.
 * @param {Object} [opts] - Options forwarded to `detectFileSync`; any
 *   non-object value (and `null`) is treated as "no options".
 * @returns {Array} All matches as returned by `detectFileSync` with
 *   `returnAllMatches` enabled.
 */
module.exports.detectFileAllSync = function(filepath, opts) {
  // Work on a copy so the caller's options object is not modified, and guard
  // against null explicitly because `typeof null` is 'object'.
  var allOpts = {};
  if (opts !== null && typeof opts === 'object') {
    Object.keys(opts).forEach(function(key) {
      allOpts[key] = opts[key];
    });
  }
  allOpts.returnAllMatches = true;
  return self.detectFileSync(filepath, allOpts);
};