diff --git a/README.md b/README.md index 7da2139..6563a85 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ import chardet from 'chardet'; chardet.analyse(Buffer.from('hello there!')); ``` -Returned value is an array of objects sorted by confidence value in decending order +Returned value is an array of objects sorted by confidence value in descending order ```javascript [ @@ -48,8 +48,8 @@ Returned value is an array of objects sorted by confidence value in decending or ## Working with large data sets -Sometimes, when data set is huge and you want to optimize performace (with a tradeoff of less accuracy), -you can sample only first N bytes of the buffer: +Sometimes, when data set is huge and you want to optimize performance (with a tradeoff of less accuracy), +you can sample only the first N bytes of the buffer: ```javascript chardet @@ -57,6 +57,14 @@ chardet .then(encoding => console.log(encoding)); ``` +You can also specify where to begin reading from in the buffer: + +```javascript +chardet + .detectFile('/path/to/file', { sampleSize: 32, offset: 128 }) + .then(encoding => console.log(encoding)); +``` + ## Supported Encodings: - UTF-8 diff --git a/src/index.test.ts b/src/index.test.ts index d1eba76..56b4c62 100644 --- a/src/index.test.ts +++ b/src/index.test.ts @@ -40,6 +40,11 @@ describe('chardet', () => { const res = await chardet.detectFile(path, { sampleSize: 32 }); expect(res).toBe('UTF-8'); }); + + it('should detect encoding with smaller sample size and offset', async () => { + const res = await chardet.detectFile(path, { sampleSize: 32, offset: 64 }); + expect(res).toBe('UTF-8'); + }); }); describe('#detectFileSync', () => { @@ -50,6 +55,10 @@ describe('chardet', () => { it('should detect encoding with smaller sample size', () => { expect(chardet.detectFileSync(path, { sampleSize: 32 })).toBe('UTF-8'); }); + + it('should detect encoding with smaller sample size and offset', () => { + expect(chardet.detectFileSync(path, { sampleSize: 32, offset: 64 })).toBe('UTF-8'); + }); }); describe('#analyse', () => { diff --git a/src/index.ts b/src/index.ts index 2030222..a43ade3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -10,7 +10,8 @@ import * as sbcs from './encoding/sbcs'; import * as iso2022 from './encoding/iso2022'; interface FullOptions { - sampleSize: number + sampleSize: number, + offset: number } type Options = Partial @@ -107,7 +108,7 @@ export const detectFile = (filepath: string, opts: Options = {}): Promise { + fs.read(fd, sample, 0, opts.sampleSize, opts.offset, (err?: Error) => { handler(err, sample); }); return; @@ -123,7 +124,7 @@ export const detectFileSync = (filepath: string, opts: Options = {}): DetectResu const fd = fs.openSync(filepath, 'r'); const sample = Buffer.allocUnsafe(opts.sampleSize); - fs.readSync(fd, sample, 0, opts.sampleSize); + fs.readSync(fd, sample, 0, opts.sampleSize, opts.offset); fs.closeSync(fd); return detect(sample); }