From 2fb68b1f485484e524de0635869e57804d6b1c8a Mon Sep 17 00:00:00 2001 From: Dmitry Shirokov Date: Fri, 25 Sep 2020 10:27:21 +1000 Subject: [PATCH] feat: Support strings as inputs --- .gitignore | 1 + README.md | 16 +++++++++++++--- package.json | 8 ++++++-- src/index.test.ts | 17 ++++++++++++++--- src/index.ts | 9 ++++++--- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 0d608f6..05061fb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ node_modules coverage npm-debug.log lib +TODO.md diff --git a/README.md b/README.md index d49d4dd..e8efd01 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,14 @@ # chardet [![Build Status](https://travis-ci.org/runk/node-chardet.png)](https://travis-ci.org/runk/node-chardet) -Chardet is a character detection module for NodeJS written in pure Javascript. -Module is based on ICU project http://site.icu-project.org/, which uses character -occurency analysis to determine the most probable encoding. +*Chardet* is a character detection module written in pure Javascript (Typescript). The module uses occurrence analysis to determine the most probable encoding. + +- Packed size is only **22 KB** +- No dependencies +- No native code / bindings +- Works in all environments: Node / Browser / Native +- Works on all platforms: Linux / Mac / Windows +- 100% written in Typescript +- Extensive code coverage ## Installation @@ -87,3 +93,7 @@ Currently only these encodings are supported. ## Typescript? Yes. Type definitions are included. 
+ +### References + +- ICU project http://site.icu-project.org/ diff --git a/package.json b/package.json index de04c47..c7cde62 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "chardet", "version": "0.0.0-development", "homepage": "https://github.com/runk/node-chardet", - "description": "Character detector", + "description": "Character encoding detector", "license": "MIT", "repository": { "type": "git", @@ -50,7 +50,11 @@ "utf8", "detector", "chardet", - "icu" + "icu", + "character detection", + "character encoding", + "iconv", + "iconv-light" ], "author": "Dmitry Shirokov ", "contributors": [ diff --git a/src/index.test.ts b/src/index.test.ts index 6a1f403..d0c1cc0 100644 --- a/src/index.test.ts +++ b/src/index.test.ts @@ -5,6 +5,8 @@ import fs from 'fs'; describe('chardet', () => { const path = __dirname + '/test/data/encodings/utf8'; + const getInput = () => fs.readFileSync(path); + const expectedEncodingsFromPath = [ { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined }, { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' }, @@ -25,8 +27,12 @@ describe('chardet', () => { }); describe('#detect', () => { - it('should detect encoding', () => { - expect(chardet.detect(fs.readFileSync(path))).toBe('UTF-8'); + it('should detect encoding from a buffer', () => { + expect(chardet.detect(getInput())).toBe('UTF-8'); + }); + + it('should detect encoding from a string', () => { + expect(chardet.detect(getInput().toString('utf-8'))).toBe('UTF-8'); }); }); @@ -54,7 +60,12 @@ describe('chardet', () => { describe('#analyse', () => { it('should return a list of encodings, sorted by confidence level in decending order', () => { - const matches = chardet.analyse(fs.readFileSync(path)); + const matches = chardet.analyse(getInput()); + expect(matches).toEqual(expectedEncodingsFromPath); + }); + + it('should work for strings as inputs', () => { + const matches = chardet.analyse(getInput().toString('utf8')); 
expect(matches).toEqual(expectedEncodingsFromPath); }); }); diff --git a/src/index.ts b/src/index.ts index e5edcb7..f9ceba0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -49,13 +49,16 @@ const recognisers: Recogniser[] = [ ]; type DetectResult = Match[] | string | null; +type InputData = Uint8Array | string; -export const detect = (buffer: Uint8Array): string | null => { - const matches: Match[] = analyse(buffer); +export const detect = (input: InputData): string | null => { + const matches: Match[] = analyse(input); return matches.length > 0 ? matches[0].name : null; }; -export const analyse = (buffer: Uint8Array): Match[] => { +export const analyse = (input: InputData): Match[] => { + const buffer = typeof input === 'string' ? Buffer.from(input) : input; + // Tally up the byte occurrence statistics. const fByteStats = []; for (let i = 0; i < 256; i++) fByteStats[i] = 0;