Merge pull request #64 from crisp-dev/master

feat: allow position offset as option
2022-10-09 22:47:30 +11:00 · 2022-10-09 22:47:30 +11:00 · 685cba81b3
parent 71e8016266 bab1db0a77
commit 685cba81b3
3 changed files with 24 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -37,7 +37,7 @@ import chardet from 'chardet';
 chardet.analyse(Buffer.from('hello there!'));
 ```

-Returned value is an array of objects sorted by confidence value in decending order
+Returned value is an array of objects sorted by confidence value in descending order

 ```javascript
 [
@ -48,8 +48,8 @@ Returned value is an array of objects sorted by confidence value in decending or

 ## Working with large data sets

-Sometimes, when data set is huge and you want to optimize performace (with a tradeoff of less accuracy),
-you can sample only first N bytes of the buffer:
+Sometimes, when the data set is huge and you want to optimize performance (with a tradeoff of less accuracy),
+you can sample only the first N bytes of the buffer:

 ```javascript
 chardet
@ -57,6 +57,14 @@ chardet
  .then(encoding => console.log(encoding));
 ```

+You can also specify the byte offset in the file at which to begin reading:
+
+```javascript
+chardet
+  .detectFile('/path/to/file', { sampleSize: 32, offset: 128 })
+  .then(encoding => console.log(encoding));
+```
+
 ## Supported Encodings:

 - UTF-8
--- a/src/index.test.ts
+++ b/src/index.test.ts
@ -40,6 +40,11 @@ describe('chardet', () => {
      const res = await chardet.detectFile(path, { sampleSize: 32 });
      expect(res).toBe('UTF-8');
    });
+
+    it('should detect encoding with smaller sample size and offset', async () => {
+      const res = await chardet.detectFile(path, { sampleSize: 32, offset: 64 });
+      expect(res).toBe('UTF-8');
+    });
  });

  describe('#detectFileSync', () => {
@ -50,6 +55,10 @@ describe('chardet', () => {
    it('should detect encoding with smaller sample size', () => {
      expect(chardet.detectFileSync(path, { sampleSize: 32 })).toBe('UTF-8');
    });
+
+    it('should detect encoding with smaller sample size and offset', () => {
+      expect(chardet.detectFileSync(path, { sampleSize: 32, offset: 64 })).toBe('UTF-8');
+    });
  });

  describe('#analyse', () => {
--- a/src/index.ts
+++ b/src/index.ts
@ -10,7 +10,8 @@ import * as sbcs from './encoding/sbcs';
 import * as iso2022 from './encoding/iso2022';

 interface FullOptions {
-  sampleSize: number
+  sampleSize: number,
+  offset: number
 }

 type Options = Partial<FullOptions>
@ -107,7 +108,7 @@ export const detectFile = (filepath: string, opts: Options = {}): Promise<Detect
      fd = fs.openSync(filepath, 'r');
      const sample: Buffer = Buffer.allocUnsafe(opts.sampleSize);

-      fs.read(fd, sample, 0, opts.sampleSize, null, (err?: Error) => {
+      fs.read(fd, sample, 0, opts.sampleSize, opts.offset, (err?: Error) => {
        handler(err, sample);
      });
      return;
@ -123,7 +124,7 @@ export const detectFileSync = (filepath: string, opts: Options = {}): DetectResu
    const fd = fs.openSync(filepath, 'r');
    const sample = Buffer.allocUnsafe(opts.sampleSize);

-    fs.readSync(fd, sample, 0, opts.sampleSize);
+    fs.readSync(fd, sample, 0, opts.sampleSize, opts.offset);
    fs.closeSync(fd);
    return detect(sample);
  }