Compare commits

1 commit

master...accept-str

| Author | SHA1 | Date |
|---|---|---|
| Dmitry Shirokov | 2fb68b1f48 | |
.gitignore

@@ -4,3 +4,4 @@ node_modules
 coverage
 npm-debug.log
 lib
+TODO.md
README.md (16 changed lines)

@@ -1,8 +1,14 @@
 # chardet [![Build Status](https://travis-ci.org/runk/node-chardet.png)](https://travis-ci.org/runk/node-chardet)
 
-Chardet is a character detection module for NodeJS written in pure Javascript.
-Module is based on ICU project http://site.icu-project.org/, which uses character
-occurency analysis to determine the most probable encoding.
+*Chardet* is a character detection module written in pure Javascript (Typescript). Module uses occurrence analysis to determine the most probable encoding.
+
+- Packed size is only **22 KB**
+- No dependencies
+- No native code / bindings
+- Works in all environments: Node / Browser / Native
+- Works on all platforms: Linux / Mac / Windows
+- 100% written in Typescript
+- Extensive code coverage
 
 ## Installation
 
@@ -87,3 +93,7 @@ Currently only these encodings are supported.
 ## Typescript?
 
 Yes. Type definitions are included.
+
+### References
+
+- ICU project http://site.icu-project.org/
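The README text above only sketches the public API. As a rough illustration of how the module is typically consumed (the file path is a placeholder; `detect` and `analyse` are the exports shown in the source diff further down):

```ts
import { analyse, detect } from 'chardet';
import fs from 'fs';

// Ask chardet for the single most probable encoding of a chunk of bytes.
const buffer = fs.readFileSync('/path/to/some-file');
console.log(detect(buffer)); // e.g. 'UTF-8'

// analyse() returns every candidate encoding with a confidence score,
// sorted from most to least probable.
console.log(analyse(buffer));
```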
package.json

@@ -2,7 +2,7 @@
   "name": "chardet",
   "version": "0.0.0-development",
   "homepage": "https://github.com/runk/node-chardet",
-  "description": "Character detector",
+  "description": "Character encoding detector",
   "license": "MIT",
   "repository": {
     "type": "git",

@@ -50,7 +50,11 @@
     "utf8",
     "detector",
     "chardet",
-    "icu"
+    "icu",
+    "character detection",
+    "character encoding",
+    "iconv",
+    "iconv-light"
   ],
   "author": "Dmitry Shirokov <deadrunk@gmail.com>",
   "contributors": [
@@ -5,6 +5,8 @@ import fs from 'fs';
 describe('chardet', () => {
 
   const path = __dirname + '/test/data/encodings/utf8';
+  const getInput = () => fs.readFileSync(path);
+
   const expectedEncodingsFromPath = [
     { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
     { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },

@@ -25,8 +27,12 @@ describe('chardet', () => {
   });
 
   describe('#detect', () => {
-    it('should detect encoding', () => {
-      expect(chardet.detect(fs.readFileSync(path))).toBe('UTF-8');
+    it('should detect encoding from a buffer', () => {
+      expect(chardet.detect(getInput())).toBe('UTF-8');
+    });
+
+    it('should detect encoding from a string', () => {
+      expect(chardet.detect(getInput().toString('utf-8'))).toBe('UTF-8');
     });
   });
 

@@ -54,7 +60,12 @@ describe('chardet', () => {
 
   describe('#analyse', () => {
     it('should return a list of encodings, sorted by confidence level in decending order', () => {
-      const matches = chardet.analyse(fs.readFileSync(path));
+      const matches = chardet.analyse(getInput());
+      expect(matches).toEqual(expectedEncodingsFromPath);
+    });
+
+    it('should work for strings as inputs', () => {
+      const matches = chardet.analyse(getInput().toString('utf8'));
       expect(matches).toEqual(expectedEncodingsFromPath);
     });
   });
@@ -49,13 +49,16 @@ const recognisers: Recogniser[] = [
 ];
 
 type DetectResult = Match[] | string | null;
+type InputData = Uint8Array | string;
 
-export const detect = (buffer: Uint8Array): string | null => {
-  const matches: Match[] = analyse(buffer);
+export const detect = (input: InputData): string | null => {
+  const matches: Match[] = analyse(input);
   return matches.length > 0 ? matches[0].name : null;
 };
 
-export const analyse = (buffer: Uint8Array): Match[] => {
+export const analyse = (input: InputData): Match[] => {
+  const buffer = typeof input === 'string' ? Buffer.from(input) : input;
+
   // Tally up the byte occurrence statistics.
   const fByteStats = [];
   for (let i = 0; i < 256; i++) fByteStats[i] = 0;
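With this change both `detect` and `analyse` accept either a `Uint8Array`/`Buffer` or a plain string; strings are converted with `Buffer.from(input)` before analysis. A minimal sketch of what that enables for callers (the sample text is arbitrary):

```ts
import { analyse, detect } from 'chardet';

// Buffers keep working exactly as before.
console.log(detect(Buffer.from('café au lait'))); // e.g. 'UTF-8'

// Plain strings are now accepted too; internally they are re-encoded
// via Buffer.from(), which defaults to UTF-8.
console.log(detect('café au lait'));

// analyse() still returns matches ordered by confidence.
console.log(analyse('café au lait')[0]);
```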