Compare commits

...

1 Commit

Author SHA1 Message Date
Dmitry Shirokov 2fb68b1f48
feat: Support strings as inputs 2020-09-25 10:27:21 +10:00
5 changed files with 40 additions and 11 deletions

1
.gitignore vendored
View File

@@ -4,3 +4,4 @@ node_modules
coverage coverage
npm-debug.log npm-debug.log
lib lib
TODO.md

View File

@@ -1,8 +1,14 @@
# chardet [![Build Status](https://travis-ci.org/runk/node-chardet.png)](https://travis-ci.org/runk/node-chardet) # chardet [![Build Status](https://travis-ci.org/runk/node-chardet.png)](https://travis-ci.org/runk/node-chardet)
Chardet is a character detection module for NodeJS written in pure Javascript. *Chardet* is a character detection module written in pure Javascript (Typescript). Module uses occurrence analysis to determine the most probable encoding.
Module is based on ICU project http://site.icu-project.org/, which uses character
occurency analysis to determine the most probable encoding. - Packed size is only **22 KB**
- No dependencies
- No native code / bindings
- Works in all environments: Node / Browser / Native
- Works on all platforms: Linux / Mac / Windows
- 100% written in Typescript
- Extensive code coverage
## Installation ## Installation
@@ -87,3 +93,7 @@ Currently only these encodings are supported.
## Typescript? ## Typescript?
Yes. Type definitions are included. Yes. Type definitions are included.
### References
- ICU project http://site.icu-project.org/

View File

@@ -2,7 +2,7 @@
"name": "chardet", "name": "chardet",
"version": "0.0.0-development", "version": "0.0.0-development",
"homepage": "https://github.com/runk/node-chardet", "homepage": "https://github.com/runk/node-chardet",
"description": "Character detector", "description": "Character encoding detector",
"license": "MIT", "license": "MIT",
"repository": { "repository": {
"type": "git", "type": "git",
@@ -50,7 +50,11 @@
"utf8", "utf8",
"detector", "detector",
"chardet", "chardet",
"icu" "icu",
"character detection",
"character encoding",
"iconv",
"iconv-light"
], ],
"author": "Dmitry Shirokov <deadrunk@gmail.com>", "author": "Dmitry Shirokov <deadrunk@gmail.com>",
"contributors": [ "contributors": [

View File

@@ -5,6 +5,8 @@ import fs from 'fs';
describe('chardet', () => { describe('chardet', () => {
const path = __dirname + '/test/data/encodings/utf8'; const path = __dirname + '/test/data/encodings/utf8';
const getInput = () => fs.readFileSync(path);
const expectedEncodingsFromPath = [ const expectedEncodingsFromPath = [
{ 'confidence': 100, 'name': 'UTF-8', 'lang': undefined }, { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
{ 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' }, { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
@@ -25,8 +27,12 @@ describe('chardet', () => {
}); });
describe('#detect', () => { describe('#detect', () => {
it('should detect encoding', () => { it('should detect encoding from a buffer', () => {
expect(chardet.detect(fs.readFileSync(path))).toBe('UTF-8'); expect(chardet.detect(getInput())).toBe('UTF-8');
});
it('should detect encoding from a string', () => {
expect(chardet.detect(getInput().toString('utf-8'))).toBe('UTF-8');
}); });
}); });
@@ -54,7 +60,12 @@ describe('chardet', () => {
describe('#analyse', () => { describe('#analyse', () => {
it('should return a list of encodings, sorted by confidence level in decending order', () => { it('should return a list of encodings, sorted by confidence level in decending order', () => {
const matches = chardet.analyse(fs.readFileSync(path)); const matches = chardet.analyse(getInput());
expect(matches).toEqual(expectedEncodingsFromPath);
});
it('should work for strings as inputs', () => {
const matches = chardet.analyse(getInput().toString('utf8'));
expect(matches).toEqual(expectedEncodingsFromPath); expect(matches).toEqual(expectedEncodingsFromPath);
}); });
}); });

View File

@@ -49,13 +49,16 @@ const recognisers: Recogniser[] = [
]; ];
type DetectResult = Match[] | string | null; type DetectResult = Match[] | string | null;
type InputData = Uint8Array | string;
export const detect = (buffer: Uint8Array): string | null => { export const detect = (input: InputData): string | null => {
const matches: Match[] = analyse(buffer); const matches: Match[] = analyse(input);
return matches.length > 0 ? matches[0].name : null; return matches.length > 0 ? matches[0].name : null;
}; };
export const analyse = (buffer: Uint8Array): Match[] => { export const analyse = (input: InputData): Match[] => {
const buffer = typeof input === 'string' ? Buffer.from(input) : input;
// Tally up the byte occurrence statistics. // Tally up the byte occurrence statistics.
const fByteStats = []; const fByteStats = [];
for (let i = 0; i < 256; i++) fByteStats[i] = 0; for (let i = 0; i < 256; i++) fByteStats[i] = 0;