feat: Support strings as inputs

This commit is contained in:
Dmitry Shirokov 2020-09-25 10:27:21 +10:00
parent b1761347d5
commit 2fb68b1f48
No known key found for this signature in database
GPG Key ID: 0D8CF8C72764BA46
5 changed files with 40 additions and 11 deletions

1
.gitignore vendored
View File

@ -4,3 +4,4 @@ node_modules
coverage
npm-debug.log
lib
TODO.md

View File

@ -1,8 +1,14 @@
# chardet [![Build Status](https://travis-ci.org/runk/node-chardet.png)](https://travis-ci.org/runk/node-chardet)
Chardet is a character detection module for NodeJS written in pure Javascript.
Module is based on ICU project http://site.icu-project.org/, which uses character
occurrence analysis to determine the most probable encoding.
*Chardet* is a character encoding detection module written in pure JavaScript (TypeScript). The module uses character occurrence analysis to determine the most probable encoding.
- Packed size is only **22 KB**
- No dependencies
- No native code / bindings
- Works in all environments: Node / Browser / Native
- Works on all platforms: Linux / Mac / Windows
- 100% written in TypeScript
- Extensive code coverage
## Installation
@ -87,3 +93,7 @@ Currently only these encodings are supported.
## Typescript?
Yes. Type definitions are included.
### References
- ICU project http://site.icu-project.org/

View File

@ -2,7 +2,7 @@
"name": "chardet",
"version": "0.0.0-development",
"homepage": "https://github.com/runk/node-chardet",
"description": "Character detector",
"description": "Character encoding detector",
"license": "MIT",
"repository": {
"type": "git",
@ -50,7 +50,11 @@
"utf8",
"detector",
"chardet",
"icu"
"icu",
"character detection",
"character encoding",
"iconv",
"iconv-light"
],
"author": "Dmitry Shirokov <deadrunk@gmail.com>",
"contributors": [

View File

@ -5,6 +5,8 @@ import fs from 'fs';
describe('chardet', () => {
const path = __dirname + '/test/data/encodings/utf8';
const getInput = () => fs.readFileSync(path);
const expectedEncodingsFromPath = [
{ 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
{ 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
@ -25,8 +27,12 @@ describe('chardet', () => {
});
describe('#detect', () => {
it('should detect encoding', () => {
expect(chardet.detect(fs.readFileSync(path))).toBe('UTF-8');
it('should detect encoding from a buffer', () => {
expect(chardet.detect(getInput())).toBe('UTF-8');
});
it('should detect encoding from a string', () => {
expect(chardet.detect(getInput().toString('utf-8'))).toBe('UTF-8');
});
});
@ -54,7 +60,12 @@ describe('chardet', () => {
describe('#analyse', () => {
it('should return a list of encodings, sorted by confidence level in decending order', () => {
const matches = chardet.analyse(fs.readFileSync(path));
const matches = chardet.analyse(getInput());
expect(matches).toEqual(expectedEncodingsFromPath);
});
it('should work for strings as inputs', () => {
const matches = chardet.analyse(getInput().toString('utf8'));
expect(matches).toEqual(expectedEncodingsFromPath);
});
});

View File

@ -49,13 +49,16 @@ const recognisers: Recogniser[] = [
];
type DetectResult = Match[] | string | null;
type InputData = Uint8Array | string;
export const detect = (buffer: Uint8Array): string | null => {
const matches: Match[] = analyse(buffer);
/**
 * Detects a single encoding name for the given input.
 *
 * Delegates to `analyse` and reports only the first match it returns.
 * NOTE(review): the first match is presumably the highest-confidence one —
 * the test suite describes `analyse` output as sorted by confidence in
 * descending order; confirm against the `analyse` implementation.
 *
 * @param input - The data to inspect: raw bytes (`Uint8Array`) or a string.
 * @returns The `name` of the first match, or `null` when `analyse` returns
 * an empty list.
 */
export const detect = (input: InputData): string | null => {
const matches: Match[] = analyse(input);
return matches.length > 0 ? matches[0].name : null;
};
export const analyse = (buffer: Uint8Array): Match[] => {
export const analyse = (input: InputData): Match[] => {
const buffer = typeof input === 'string' ? Buffer.from(input) : input;
// Tally up the byte occurrence statistics.
const fByteStats = [];
for (let i = 0; i < 256; i++) fByteStats[i] = 0;