Merge pull request #64 from crisp-dev/master

feat: allow position offset as option
This commit is contained in:
Dmitry Shirokov 2022-10-09 22:47:30 +11:00 committed by GitHub
commit 685cba81b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 24 additions and 6 deletions

View File

@ -37,7 +37,7 @@ import chardet from 'chardet';
chardet.analyse(Buffer.from('hello there!'));
```
Returned value is an array of objects sorted by confidence value in decending order
Returned value is an array of objects sorted by confidence value in descending order
```javascript
[
@ -48,8 +48,8 @@ Returned value is an array of objects sorted by confidence value in decending or
## Working with large data sets
Sometimes, when data set is huge and you want to optimize performace (with a tradeoff of less accuracy),
you can sample only first N bytes of the buffer:
Sometimes, when the data set is huge and you want to optimize performance (at the cost of some accuracy),
you can sample only the first N bytes of the buffer:
```javascript
chardet
@ -57,6 +57,14 @@ chardet
.then(encoding => console.log(encoding));
```
You can also specify the byte offset in the file at which to begin reading the sample:
```javascript
chardet
.detectFile('/path/to/file', { sampleSize: 32, offset: 128 })
.then(encoding => console.log(encoding));
```
## Supported Encodings:
- UTF-8

View File

@ -40,6 +40,11 @@ describe('chardet', () => {
const res = await chardet.detectFile(path, { sampleSize: 32 });
expect(res).toBe('UTF-8');
});
it('should detect encoding with smaller sample size and offset', async () => {
const res = await chardet.detectFile(path, { sampleSize: 32, offset: 64 });
expect(res).toBe('UTF-8');
});
});
describe('#detectFileSync', () => {
@ -50,6 +55,10 @@ describe('chardet', () => {
it('should detect encoding with smaller sample size', () => {
expect(chardet.detectFileSync(path, { sampleSize: 32 })).toBe('UTF-8');
});
it('should detect encoding with smaller sample size and offset', () => {
expect(chardet.detectFileSync(path, { sampleSize: 32, offset: 64 })).toBe('UTF-8');
});
});
describe('#analyse', () => {

View File

@ -10,7 +10,8 @@ import * as sbcs from './encoding/sbcs';
import * as iso2022 from './encoding/iso2022';
interface FullOptions {
sampleSize: number
sampleSize: number,
offset: number
}
type Options = Partial<FullOptions>
@ -107,7 +108,7 @@ export const detectFile = (filepath: string, opts: Options = {}): Promise<Detect
fd = fs.openSync(filepath, 'r');
const sample: Buffer = Buffer.allocUnsafe(opts.sampleSize);
fs.read(fd, sample, 0, opts.sampleSize, null, (err?: Error) => {
fs.read(fd, sample, 0, opts.sampleSize, opts.offset, (err?: Error) => {
handler(err, sample);
});
return;
@ -123,7 +124,7 @@ export const detectFileSync = (filepath: string, opts: Options = {}): DetectResu
const fd = fs.openSync(filepath, 'r');
const sample = Buffer.allocUnsafe(opts.sampleSize);
fs.readSync(fd, sample, 0, opts.sampleSize);
fs.readSync(fd, sample, 0, opts.sampleSize, opts.offset);
fs.closeSync(fd);
return detect(sample);
}