chardet/src/encoding/mbcs.ts

import { Context, Recogniser } from '.';
var match = require('../match').default;

/**
 * Binary search implementation (recursive)
 */
function binarySearch(arr: number[], searchValue: number) {
  const find = (
    arr: number[],
    searchValue: number,
    left: number,
    right: number
  ): number => {
    if (right < left) return -1;

    /*
    int mid = mid = (left + right) / 2;
    There is a bug in the above line;
    Joshua Bloch suggests the following replacement:
    */
    var mid = Math.floor((left + right) >>> 1);
    if (searchValue > arr[mid]) return find(arr, searchValue, mid + 1, right);

    if (searchValue < arr[mid]) return find(arr, searchValue, left, mid - 1);

    return mid;
  };

  return find(arr, searchValue, 0, arr.length - 1);
}

// 'Character'  iterated character class.
//    Recognizers for specific mbcs encodings make their 'characters' available
//    by providing a nextChar() function that fills in an instance of iteratedChar
//    with the next char from the input.
//    The returned characters are not converted to Unicode, but remain as the raw
//    bytes (concatenated into an int) from the codepage data.
//
//  For Asian charsets, use the raw input rather than the input that has been
//   stripped of markup.  Detection only considers multi-byte chars, effectively
//   stripping markup anyway, and double byte chars do occur in markup too.
//
class IteratedChar {
  charValue: number; // 1-4 bytes from the raw input data
  index: number;
  nextIndex: number;
  error: boolean;
  done: boolean;

  constructor() {
    this.charValue = 0; // 1-4 bytes from the raw input data
    this.index = 0;
    this.nextIndex = 0;
    this.error = false;
    this.done = false;
  }

  reset() {
    this.charValue = 0;
    this.index = -1;
    this.nextIndex = 0;
    this.error = false;
    this.done = false;
  }

  nextByte(det: Context) {
    if (this.nextIndex >= det.fRawLength) {
      this.done = true;
      return -1;
    }
    var byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;
    return byteValue;
  }
}

/**
 * Asian double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 */

class mbcs implements Recogniser {
  commonChars: number[] = [];

  name() {
    return 'mbcs';
  }

  /**
   * Test the match of this charset with the input text data
   *      which is obtained via the CharsetDetector object.
   *
   * @param det  The CharsetDetector, which contains the input text
   *             to be checked for being in this charset.
   * @return     Two values packed into one int  (Damn java, anyhow)
   *             bits 0-7:  the match confidence, ranging from 0-100
   *             bits 8-15: The match reason, an enum-like value.
   */
  match(det: Context) {
    var singleByteCharCount = 0, //TODO Do we really need this?
      doubleByteCharCount = 0,
      commonCharCount = 0,
      badCharCount = 0,
      totalCharCount = 0,
      confidence = 0;

    var iter = new IteratedChar();

    detectBlock: {
      for (iter.reset(); this.nextChar(iter, det); ) {
        totalCharCount++;
        if (iter.error) {
          badCharCount++;
        } else {
          var cv = iter.charValue & 0xffffffff;

          if (cv <= 0xff) {
            singleByteCharCount++;
          } else {
            doubleByteCharCount++;
            if (this.commonChars != null) {
              // NOTE: This assumes that there are no 4-byte common chars.
              if (binarySearch(this.commonChars, cv) >= 0) {
                commonCharCount++;
              }
            }
          }
        }
        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
          // console.log('its here!')
          // Bail out early if the byte data is not matching the encoding scheme.
          break detectBlock;
        }
      }

      if (doubleByteCharCount <= 10 && badCharCount == 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
          // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
          // We don't have enough data to have any confidence.
          // Statistical analysis of single byte non-ASCII characters would probably help here.
          confidence = 0;
        } else {
          //   ASCII or ISO file?  It's probably not our encoding,
          //   but is not incompatible with our encoding, so don't give it a zero.
          confidence = 10;
        }
        break detectBlock;
      }

      //
      //  No match if there are too many characters that don't fit the encoding scheme.
      //    (should we have zero tolerance for these?)
      //
      if (doubleByteCharCount < 20 * badCharCount) {
        confidence = 0;
        break detectBlock;
      }

      if (this.commonChars == null) {
        // We have no statistics on frequently occuring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better
        confidence = 30 + doubleByteCharCount - 20 * badCharCount;
        if (confidence > 100) {
          confidence = 100;
        }
      } else {
        //
        // Frequency of occurrence statistics exist.
        //
        // @ts-ignore
        var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
        var scaleFactor = 90.0 / maxVal;
        confidence = Math.floor(
          Math.log(commonCharCount + 1) * scaleFactor + 10
        );
        confidence = Math.min(confidence, 100);
      }
    } // end of detectBlock:

    return confidence == 0 ? null : match(det, this, confidence);
  }

  /**
   * Get the next character (however many bytes it is) from the input data
   *    Subclasses for specific charset encodings must implement this function
   *    to get characters according to the rules of their encoding scheme.
   *
   *  This function is not a method of class iteratedChar only because
   *   that would require a lot of extra derived classes, which is awkward.
   * @param it  The iteratedChar 'struct' into which the returned char is placed.
   * @param det The charset detector, which is needed to get at the input byte data
   *            being iterated over.
   * @return    True if a character was returned, false at end of input.
   */
  nextChar(iter: IteratedChar, det: Context): boolean {
    return true;
  }
}

/**
 * Shift-JIS charset recognizer.
 */
export class sjis extends mbcs {
  name() {
    return 'Shift-JIS';
  }
  language() {
    return 'ja';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  commonChars = [
    0x8140,
    0x8141,
    0x8142,
    0x8145,
    0x815b,
    0x8169,
    0x816a,
    0x8175,
    0x8176,
    0x82a0,
    0x82a2,
    0x82a4,
    0x82a9,
    0x82aa,
    0x82ab,
    0x82ad,
    0x82af,
    0x82b1,
    0x82b3,
    0x82b5,
    0x82b7,
    0x82bd,
    0x82be,
    0x82c1,
    0x82c4,
    0x82c5,
    0x82c6,
    0x82c8,
    0x82c9,
    0x82cc,
    0x82cd,
    0x82dc,
    0x82e0,
    0x82e7,
    0x82e8,
    0x82e9,
    0x82ea,
    0x82f0,
    0x82f1,
    0x8341,
    0x8343,
    0x834e,
    0x834f,
    0x8358,
    0x835e,
    0x8362,
    0x8367,
    0x8375,
    0x8376,
    0x8389,
    0x838a,
    0x838b,
    0x838d,
    0x8393,
    0x8e96,
    0x93fa,
    0x95aa,
  ];

  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var firstByte;
    firstByte = iter.charValue = iter.nextByte(det);
    if (firstByte < 0) return false;

    if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf))
      return true;

    var secondByte = iter.nextByte(det);
    if (secondByte < 0) return false;

    iter.charValue = (firstByte << 8) | secondByte;
    if (
      !(
        (secondByte >= 0x40 && secondByte <= 0x7f) ||
        (secondByte >= 0x80 && secondByte <= 0xff)
      )
    ) {
      // Illegal second byte value.
      iter.error = true;
    }
    return true;
  }
}

/**
 *   Big5 charset recognizer.
 */
export class big5 extends mbcs {
  name() {
    return 'Big5';
  }
  language() {
    return 'zh';
  }
  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  commonChars = [
    0xa140,
    0xa141,
    0xa142,
    0xa143,
    0xa147,
    0xa149,
    0xa175,
    0xa176,
    0xa440,
    0xa446,
    0xa447,
    0xa448,
    0xa451,
    0xa454,
    0xa457,
    0xa464,
    0xa46a,
    0xa46c,
    0xa477,
    0xa4a3,
    0xa4a4,
    0xa4a7,
    0xa4c1,
    0xa4ce,
    0xa4d1,
    0xa4df,
    0xa4e8,
    0xa4fd,
    0xa540,
    0xa548,
    0xa558,
    0xa569,
    0xa5cd,
    0xa5e7,
    0xa657,
    0xa661,
    0xa662,
    0xa668,
    0xa670,
    0xa6a8,
    0xa6b3,
    0xa6b9,
    0xa6d3,
    0xa6db,
    0xa6e6,
    0xa6f2,
    0xa740,
    0xa751,
    0xa759,
    0xa7da,
    0xa8a3,
    0xa8a5,
    0xa8ad,
    0xa8d1,
    0xa8d3,
    0xa8e4,
    0xa8fc,
    0xa9c0,
    0xa9d2,
    0xa9f3,
    0xaa6b,
    0xaaba,
    0xaabe,
    0xaacc,
    0xaafc,
    0xac47,
    0xac4f,
    0xacb0,
    0xacd2,
    0xad59,
    0xaec9,
    0xafe0,
    0xb0ea,
    0xb16f,
    0xb2b3,
    0xb2c4,
    0xb36f,
    0xb44c,
    0xb44e,
    0xb54c,
    0xb5a5,
    0xb5bd,
    0xb5d0,
    0xb5d8,
    0xb671,
    0xb7ed,
    0xb867,
    0xb944,
    0xbad8,
    0xbb44,
    0xbba1,
    0xbdd1,
    0xc2c4,
    0xc3b9,
    0xc440,
    0xc45f,
  ];

  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;

    var firstByte = (iter.charValue = iter.nextByte(det));

    if (firstByte < 0) return false;

    // single byte character.
    if (firstByte <= 0x7f || firstByte == 0xff) return true;

    var secondByte = iter.nextByte(det);

    if (secondByte < 0) return false;

    iter.charValue = (iter.charValue << 8) | secondByte;

    if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff)
      iter.error = true;

    return true;
  }
}

/**
 *  EUC charset recognizers.  One abstract class that provides the common function
 *  for getting the next character according to the EUC encoding scheme,
 *  and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
 *
 *  Get the next character value for EUC based encodings.
 *  Character 'value' is simply the raw bytes that make up the character
 *     packed into an int.
 */
function eucNextChar(iter: IteratedChar, det: Context) {
  iter.index = iter.nextIndex;
  iter.error = false;
  var firstByte = 0;
  var secondByte = 0;
  var thirdByte = 0;
  //int fourthByte = 0;
  buildChar: {
    firstByte = iter.charValue = iter.nextByte(det);
    if (firstByte < 0) {
      // Ran off the end of the input data
      iter.done = true;
      break buildChar;
    }
    if (firstByte <= 0x8d) {
      // single byte char
      break buildChar;
    }
    secondByte = iter.nextByte(det);
    iter.charValue = (iter.charValue << 8) | secondByte;
    if (firstByte >= 0xa1 && firstByte <= 0xfe) {
      // Two byte Char
      if (secondByte < 0xa1) {
        iter.error = true;
      }
      break buildChar;
    }
    if (firstByte == 0x8e) {
      // Code Set 2.
      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
      // We don't know which we've got.
      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
      //   bytes will look like a well formed 2 byte char.
      if (secondByte < 0xa1) {
        iter.error = true;
      }
      break buildChar;
    }
    if (firstByte == 0x8f) {
      // Code set 3.
      // Three byte total char size, two bytes of actual char value.
      thirdByte = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | thirdByte;
      if (thirdByte < 0xa1) {
        iter.error = true;
      }
    }
  }
  return iter.done == false;
}

/**
 * The charset recognize for EUC-JP.  A singleton instance of this class
 *    is created and kept by the public CharsetDetector class
 */
export class euc_jp extends mbcs {
  name() {
    return 'EUC-JP';
  }
  language() {
    return 'ja';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  commonChars = [
    0xa1a1,
    0xa1a2,
    0xa1a3,
    0xa1a6,
    0xa1bc,
    0xa1ca,
    0xa1cb,
    0xa1d6,
    0xa1d7,
    0xa4a2,
    0xa4a4,
    0xa4a6,
    0xa4a8,
    0xa4aa,
    0xa4ab,
    0xa4ac,
    0xa4ad,
    0xa4af,
    0xa4b1,
    0xa4b3,
    0xa4b5,
    0xa4b7,
    0xa4b9,
    0xa4bb,
    0xa4bd,
    0xa4bf,
    0xa4c0,
    0xa4c1,
    0xa4c3,
    0xa4c4,
    0xa4c6,
    0xa4c7,
    0xa4c8,
    0xa4c9,
    0xa4ca,
    0xa4cb,
    0xa4ce,
    0xa4cf,
    0xa4d0,
    0xa4de,
    0xa4df,
    0xa4e1,
    0xa4e2,
    0xa4e4,
    0xa4e8,
    0xa4e9,
    0xa4ea,
    0xa4eb,
    0xa4ec,
    0xa4ef,
    0xa4f2,
    0xa4f3,
    0xa5a2,
    0xa5a3,
    0xa5a4,
    0xa5a6,
    0xa5a7,
    0xa5aa,
    0xa5ad,
    0xa5af,
    0xa5b0,
    0xa5b3,
    0xa5b5,
    0xa5b7,
    0xa5b8,
    0xa5b9,
    0xa5bf,
    0xa5c3,
    0xa5c6,
    0xa5c7,
    0xa5c8,
    0xa5c9,
    0xa5cb,
    0xa5d0,
    0xa5d5,
    0xa5d6,
    0xa5d7,
    0xa5de,
    0xa5e0,
    0xa5e1,
    0xa5e5,
    0xa5e9,
    0xa5ea,
    0xa5eb,
    0xa5ec,
    0xa5ed,
    0xa5f3,
    0xb8a9,
    0xb9d4,
    0xbaee,
    0xbbc8,
    0xbef0,
    0xbfb7,
    0xc4ea,
    0xc6fc,
    0xc7bd,
    0xcab8,
    0xcaf3,
    0xcbdc,
    0xcdd1,
  ];

  nextChar = eucNextChar;
}

/**
 * The charset recognize for EUC-KR.  A singleton instance of this class
 *    is created and kept by the public CharsetDetector class
 */
export class euc_kr extends mbcs {
  name() {
    return 'EUC-KR';
  }

  language() {
    return 'ko';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  commonChars = [
    0xb0a1,
    0xb0b3,
    0xb0c5,
    0xb0cd,
    0xb0d4,
    0xb0e6,
    0xb0ed,
    0xb0f8,
    0xb0fa,
    0xb0fc,
    0xb1b8,
    0xb1b9,
    0xb1c7,
    0xb1d7,
    0xb1e2,
    0xb3aa,
    0xb3bb,
    0xb4c2,
    0xb4cf,
    0xb4d9,
    0xb4eb,
    0xb5a5,
    0xb5b5,
    0xb5bf,
    0xb5c7,
    0xb5e9,
    0xb6f3,
    0xb7af,
    0xb7c2,
    0xb7ce,
    0xb8a6,
    0xb8ae,
    0xb8b6,
    0xb8b8,
    0xb8bb,
    0xb8e9,
    0xb9ab,
    0xb9ae,
    0xb9cc,
    0xb9ce,
    0xb9fd,
    0xbab8,
    0xbace,
    0xbad0,
    0xbaf1,
    0xbbe7,
    0xbbf3,
    0xbbfd,
    0xbcad,
    0xbcba,
    0xbcd2,
    0xbcf6,
    0xbdba,
    0xbdc0,
    0xbdc3,
    0xbdc5,
    0xbec6,
    0xbec8,
    0xbedf,
    0xbeee,
    0xbef8,
    0xbefa,
    0xbfa1,
    0xbfa9,
    0xbfc0,
    0xbfe4,
    0xbfeb,
    0xbfec,
    0xbff8,
    0xc0a7,
    0xc0af,
    0xc0b8,
    0xc0ba,
    0xc0bb,
    0xc0bd,
    0xc0c7,
    0xc0cc,
    0xc0ce,
    0xc0cf,
    0xc0d6,
    0xc0da,
    0xc0e5,
    0xc0fb,
    0xc0fc,
    0xc1a4,
    0xc1a6,
    0xc1b6,
    0xc1d6,
    0xc1df,
    0xc1f6,
    0xc1f8,
    0xc4a1,
    0xc5cd,
    0xc6ae,
    0xc7cf,
    0xc7d1,
    0xc7d2,
    0xc7d8,
    0xc7e5,
    0xc8ad,
  ];

  nextChar = eucNextChar;
}

/**
 *   GB-18030 recognizer. Uses simplified Chinese statistics.
 */
export class gb_18030 extends mbcs {
  name() {
    return 'GB18030';
  }

  language() {
    return 'zh';
  }

  /*
   *  Get the next character value for EUC based encodings.
   *  Character 'value' is simply the raw bytes that make up the character
   *     packed into an int.
   */

  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;
    var firstByte = 0;
    var secondByte = 0;
    var thirdByte = 0;
    var fourthByte = 0;
    buildChar: {
      firstByte = iter.charValue = iter.nextByte(det);
      if (firstByte < 0) {
        // Ran off the end of the input data
        iter.done = true;
        break buildChar;
      }
      if (firstByte <= 0x80) {
        // single byte char
        break buildChar;
      }
      secondByte = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | secondByte;
      if (firstByte >= 0x81 && firstByte <= 0xfe) {
        // Two byte Char
        if (
          (secondByte >= 0x40 && secondByte <= 0x7e) ||
          (secondByte >= 80 && secondByte <= 0xfe)
        ) {
          break buildChar;
        }
        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
          thirdByte = iter.nextByte(det);
          if (thirdByte >= 0x81 && thirdByte <= 0xfe) {
            fourthByte = iter.nextByte(det);
            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
              iter.charValue =
                (iter.charValue << 16) | (thirdByte << 8) | fourthByte;
              break buildChar;
            }
          }
        }
        iter.error = true;
        break buildChar;
      }
    }
    return iter.done == false;
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  commonChars = [
    0xa1a1,
    0xa1a2,
    0xa1a3,
    0xa1a4,
    0xa1b0,
    0xa1b1,
    0xa1f1,
    0xa1f3,
    0xa3a1,
    0xa3ac,
    0xa3ba,
    0xb1a8,
    0xb1b8,
    0xb1be,
    0xb2bb,
    0xb3c9,
    0xb3f6,
    0xb4f3,
    0xb5bd,
    0xb5c4,
    0xb5e3,
    0xb6af,
    0xb6d4,
    0xb6e0,
    0xb7a2,
    0xb7a8,
    0xb7bd,
    0xb7d6,
    0xb7dd,
    0xb8b4,
    0xb8df,
    0xb8f6,
    0xb9ab,
    0xb9c9,
    0xb9d8,
    0xb9fa,
    0xb9fd,
    0xbacd,
    0xbba7,
    0xbbd6,
    0xbbe1,
    0xbbfa,
    0xbcbc,
    0xbcdb,
    0xbcfe,
    0xbdcc,
    0xbecd,
    0xbedd,
    0xbfb4,
    0xbfc6,
    0xbfc9,
    0xc0b4,
    0xc0ed,
    0xc1cb,
    0xc2db,
    0xc3c7,
    0xc4dc,
    0xc4ea,
    0xc5cc,
    0xc6f7,
    0xc7f8,
    0xc8ab,
    0xc8cb,
    0xc8d5,
    0xc8e7,
    0xc9cf,
    0xc9fa,
    0xcab1,
    0xcab5,
    0xcac7,
    0xcad0,
    0xcad6,
    0xcaf5,
    0xcafd,
    0xccec,
    0xcdf8,
    0xceaa,
    0xcec4,
    0xced2,
    0xcee5,
    0xcfb5,
    0xcfc2,
    0xcfd6,
    0xd0c2,
    0xd0c5,
    0xd0d0,
    0xd0d4,
    0xd1a7,
    0xd2aa,
    0xd2b2,
    0xd2b5,
    0xd2bb,
    0xd2d4,
    0xd3c3,
    0xd3d0,
    0xd3fd,
    0xd4c2,
    0xd4da,
    0xd5e2,
    0xd6d0,
  ];
}
BREAKING CHANGE: Repo overhaul - Deprecation of callbacks in favour of promises - Deprecation of `detectAll`, `detectFileAll` and `detectFileAllSync` - use `analyse` fn instead. - Typescript typings now included as part of distribution - Modules support - Travis CI org => com migration - Lazy loading of `fs` module to enable usage in browser 2020-03-30 03:42:04 +00:00			`import { Context, Recogniser } from '.';`
			`var match = require('../match').default;`

			`/**`
			`* Binary search implementation (recursive)`
			`*/`
			`function binarySearch(arr: number[], searchValue: number) {`
			`const find = (`
			`arr: number[],`
			`searchValue: number,`
			`left: number,`
			`right: number`
			`): number => {`
			`if (right < left) return -1;`

			`/*`
			`int mid = mid = (left + right) / 2;`
			`There is a bug in the above line;`
			`Joshua Bloch suggests the following replacement:`
			`*/`
			`var mid = Math.floor((left + right) >>> 1);`
			`if (searchValue > arr[mid]) return find(arr, searchValue, mid + 1, right);`

			`if (searchValue < arr[mid]) return find(arr, searchValue, left, mid - 1);`

			`return mid;`
			`};`

			`return find(arr, searchValue, 0, arr.length - 1);`
			`}`

			`// 'Character' iterated character class.`
			`// Recognizers for specific mbcs encodings make their 'characters' available`
			`// by providing a nextChar() function that fills in an instance of iteratedChar`
			`// with the next char from the input.`
			`// The returned characters are not converted to Unicode, but remain as the raw`
			`// bytes (concatenated into an int) from the codepage data.`
			`//`
			`// For Asian charsets, use the raw input rather than the input that has been`
			`// stripped of markup. Detection only considers multi-byte chars, effectively`
			`// stripping markup anyway, and double byte chars do occur in markup too.`
			`//`
			`class IteratedChar {`
			`charValue: number; // 1-4 bytes from the raw input data`
			`index: number;`
			`nextIndex: number;`
			`error: boolean;`
			`done: boolean;`

			`constructor() {`
			`this.charValue = 0; // 1-4 bytes from the raw input data`
			`this.index = 0;`
			`this.nextIndex = 0;`
			`this.error = false;`
			`this.done = false;`
			`}`

			`reset() {`
			`this.charValue = 0;`
			`this.index = -1;`
			`this.nextIndex = 0;`
			`this.error = false;`
			`this.done = false;`
			`}`

			`nextByte(det: Context) {`
			`if (this.nextIndex >= det.fRawLength) {`
			`this.done = true;`
			`return -1;`
			`}`
			`var byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;`
			`return byteValue;`
			`}`
			`}`

			`/**`
			`* Asian double or multi-byte - charsets.`
			`* Match is determined mostly by the input data adhering to the`
			`* encoding scheme for the charset, and, optionally,`
			`* frequency-of-occurrence of characters.`
			`*/`

			`class mbcs implements Recogniser {`
			`commonChars: number[] = [];`

			`name() {`
			`return 'mbcs';`
			`}`

			`/**`
			`* Test the match of this charset with the input text data`
			`* which is obtained via the CharsetDetector object.`
			`*`
			`* @param det The CharsetDetector, which contains the input text`
			`* to be checked for being in this charset.`
			`* @return Two values packed into one int (Damn java, anyhow)`
			`* bits 0-7: the match confidence, ranging from 0-100`
			`* bits 8-15: The match reason, an enum-like value.`
			`*/`
			`match(det: Context) {`
			`var singleByteCharCount = 0, //TODO Do we really need this?`
			`doubleByteCharCount = 0,`
			`commonCharCount = 0,`
			`badCharCount = 0,`
			`totalCharCount = 0,`
			`confidence = 0;`

			`var iter = new IteratedChar();`

			`detectBlock: {`
			`for (iter.reset(); this.nextChar(iter, det); ) {`
			`totalCharCount++;`
			`if (iter.error) {`
			`badCharCount++;`
			`} else {`
			`var cv = iter.charValue & 0xffffffff;`

			`if (cv <= 0xff) {`
			`singleByteCharCount++;`
			`} else {`
			`doubleByteCharCount++;`
			`if (this.commonChars != null) {`
			`// NOTE: This assumes that there are no 4-byte common chars.`
			`if (binarySearch(this.commonChars, cv) >= 0) {`
			`commonCharCount++;`
			`}`
			`}`
			`}`
			`}`
			`if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {`
			`// console.log('its here!')`
			`// Bail out early if the byte data is not matching the encoding scheme.`
			`break detectBlock;`
			`}`
			`}`

			`if (doubleByteCharCount <= 10 && badCharCount == 0) {`
			`// Not many multi-byte chars.`
			`if (doubleByteCharCount == 0 && totalCharCount < 10) {`
			`// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.`
			`// We don't have enough data to have any confidence.`
			`// Statistical analysis of single byte non-ASCII characters would probably help here.`
			`confidence = 0;`
			`} else {`
			`// ASCII or ISO file? It's probably not our encoding,`
			`// but is not incompatible with our encoding, so don't give it a zero.`
			`confidence = 10;`
			`}`
			`break detectBlock;`
			`}`

			`//`
			`// No match if there are too many characters that don't fit the encoding scheme.`
			`// (should we have zero tolerance for these?)`
			`//`
			`if (doubleByteCharCount < 20 * badCharCount) {`
			`confidence = 0;`
			`break detectBlock;`
			`}`

			`if (this.commonChars == null) {`
			`// We have no statistics on frequently occuring characters.`
			`// Assess confidence purely on having a reasonable number of`
			`// multi-byte characters (the more the better`
			`confidence = 30 + doubleByteCharCount - 20 * badCharCount;`
			`if (confidence > 100) {`
			`confidence = 100;`
			`}`
			`} else {`
			`//`
			`// Frequency of occurrence statistics exist.`
			`//`
			`// @ts-ignore`
			`var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);`
			`var scaleFactor = 90.0 / maxVal;`
			`confidence = Math.floor(`
			`Math.log(commonCharCount + 1) * scaleFactor + 10`
			`);`
			`confidence = Math.min(confidence, 100);`
			`}`
			`} // end of detectBlock:`

			`return confidence == 0 ? null : match(det, this, confidence);`
			`}`

			`/**`
			`* Get the next character (however many bytes it is) from the input data`
			`* Subclasses for specific charset encodings must implement this function`
			`* to get characters according to the rules of their encoding scheme.`
			`*`
			`* This function is not a method of class iteratedChar only because`
			`* that would require a lot of extra derived classes, which is awkward.`
			`* @param it The iteratedChar 'struct' into which the returned char is placed.`
			`* @param det The charset detector, which is needed to get at the input byte data`
			`* being iterated over.`
			`* @return True if a character was returned, false at end of input.`
			`*/`
			`nextChar(iter: IteratedChar, det: Context): boolean {`
			`return true;`
			`}`
			`}`

			`/**`
			`* Shift-JIS charset recognizer.`
			`*/`
			`export class sjis extends mbcs {`
			`name() {`
			`return 'Shift-JIS';`
			`}`
			`language() {`
			`return 'ja';`
			`}`

			`// TODO: This set of data comes from the character frequency-`
			`// of-occurrence analysis tool. The data needs to be moved`
			`// into a resource and loaded from there.`
			`commonChars = [`
			`0x8140,`
			`0x8141,`
			`0x8142,`
			`0x8145,`
			`0x815b,`
			`0x8169,`
			`0x816a,`
			`0x8175,`
			`0x8176,`
			`0x82a0,`
			`0x82a2,`
			`0x82a4,`
			`0x82a9,`
			`0x82aa,`
			`0x82ab,`
			`0x82ad,`
			`0x82af,`
			`0x82b1,`
			`0x82b3,`
			`0x82b5,`
			`0x82b7,`
			`0x82bd,`
			`0x82be,`
			`0x82c1,`
			`0x82c4,`
			`0x82c5,`
			`0x82c6,`
			`0x82c8,`
			`0x82c9,`
			`0x82cc,`
			`0x82cd,`
			`0x82dc,`
			`0x82e0,`
			`0x82e7,`
			`0x82e8,`
			`0x82e9,`
			`0x82ea,`
			`0x82f0,`
			`0x82f1,`
			`0x8341,`
			`0x8343,`
			`0x834e,`
			`0x834f,`
			`0x8358,`
			`0x835e,`
			`0x8362,`
			`0x8367,`
			`0x8375,`
			`0x8376,`
			`0x8389,`
			`0x838a,`
			`0x838b,`
			`0x838d,`
			`0x8393,`
			`0x8e96,`
			`0x93fa,`
			`0x95aa,`
			`];`

			`nextChar(iter: IteratedChar, det: Context) {`
			`iter.index = iter.nextIndex;`
			`iter.error = false;`

			`var firstByte;`
			`firstByte = iter.charValue = iter.nextByte(det);`
			`if (firstByte < 0) return false;`

			`if (firstByte <= 0x7f \|\| (firstByte > 0xa0 && firstByte <= 0xdf))`
			`return true;`

			`var secondByte = iter.nextByte(det);`
			`if (secondByte < 0) return false;`

			`iter.charValue = (firstByte << 8) \| secondByte;`
			`if (`
			`!(`
			`(secondByte >= 0x40 && secondByte <= 0x7f) \|\|`
			`(secondByte >= 0x80 && secondByte <= 0xff)`
			`)`
			`) {`
			`// Illegal second byte value.`
			`iter.error = true;`
			`}`
			`return true;`
			`}`
			`}`

			`/**`
			`* Big5 charset recognizer.`
			`*/`
			`export class big5 extends mbcs {`
			`name() {`
			`return 'Big5';`
			`}`
			`language() {`
			`return 'zh';`
			`}`
			`// TODO: This set of data comes from the character frequency-`
			`// of-occurrence analysis tool. The data needs to be moved`
			`// into a resource and loaded from there.`
			`commonChars = [`
			`0xa140,`
			`0xa141,`
			`0xa142,`
			`0xa143,`
			`0xa147,`
			`0xa149,`
			`0xa175,`
			`0xa176,`
			`0xa440,`
			`0xa446,`
			`0xa447,`
			`0xa448,`
			`0xa451,`
			`0xa454,`
			`0xa457,`
			`0xa464,`
			`0xa46a,`
			`0xa46c,`
			`0xa477,`
			`0xa4a3,`
			`0xa4a4,`
			`0xa4a7,`
			`0xa4c1,`
			`0xa4ce,`
			`0xa4d1,`
			`0xa4df,`
			`0xa4e8,`
			`0xa4fd,`
			`0xa540,`
			`0xa548,`
			`0xa558,`
			`0xa569,`
			`0xa5cd,`
			`0xa5e7,`
			`0xa657,`
			`0xa661,`
			`0xa662,`
			`0xa668,`
			`0xa670,`
			`0xa6a8,`
			`0xa6b3,`
			`0xa6b9,`
			`0xa6d3,`
			`0xa6db,`
			`0xa6e6,`
			`0xa6f2,`
			`0xa740,`
			`0xa751,`
			`0xa759,`
			`0xa7da,`
			`0xa8a3,`
			`0xa8a5,`
			`0xa8ad,`
			`0xa8d1,`
			`0xa8d3,`
			`0xa8e4,`
			`0xa8fc,`
			`0xa9c0,`
			`0xa9d2,`
			`0xa9f3,`
			`0xaa6b,`
			`0xaaba,`
			`0xaabe,`
			`0xaacc,`
			`0xaafc,`
			`0xac47,`
			`0xac4f,`
			`0xacb0,`
			`0xacd2,`
			`0xad59,`
			`0xaec9,`
			`0xafe0,`
			`0xb0ea,`
			`0xb16f,`
			`0xb2b3,`
			`0xb2c4,`
			`0xb36f,`
			`0xb44c,`
			`0xb44e,`
			`0xb54c,`
			`0xb5a5,`
			`0xb5bd,`
			`0xb5d0,`
			`0xb5d8,`
			`0xb671,`
			`0xb7ed,`
			`0xb867,`
			`0xb944,`
			`0xbad8,`
			`0xbb44,`
			`0xbba1,`
			`0xbdd1,`
			`0xc2c4,`
			`0xc3b9,`
			`0xc440,`
			`0xc45f,`
			`];`

			`nextChar(iter: IteratedChar, det: Context) {`
			`iter.index = iter.nextIndex;`
			`iter.error = false;`

			`var firstByte = (iter.charValue = iter.nextByte(det));`

			`if (firstByte < 0) return false;`

			`// single byte character.`
			`if (firstByte <= 0x7f \|\| firstByte == 0xff) return true;`

			`var secondByte = iter.nextByte(det);`

			`if (secondByte < 0) return false;`

			`iter.charValue = (iter.charValue << 8) \| secondByte;`

			`if (secondByte < 0x40 \|\| secondByte == 0x7f \|\| secondByte == 0xff)`
			`iter.error = true;`

			`return true;`
			`}`
			`}`

			`/**`
			`* EUC charset recognizers. One abstract class that provides the common function`
			`* for getting the next character according to the EUC encoding scheme,`
			`* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.`
			`*`
			`* Get the next character value for EUC based encodings.`
			`* Character 'value' is simply the raw bytes that make up the character`
			`* packed into an int.`
			`*/`
			`function eucNextChar(iter: IteratedChar, det: Context) {`
			`iter.index = iter.nextIndex;`
			`iter.error = false;`
			`var firstByte = 0;`
			`var secondByte = 0;`
			`var thirdByte = 0;`
			`//int fourthByte = 0;`
			`buildChar: {`
			`firstByte = iter.charValue = iter.nextByte(det);`
			`if (firstByte < 0) {`
			`// Ran off the end of the input data`
			`iter.done = true;`
			`break buildChar;`
			`}`
			`if (firstByte <= 0x8d) {`
			`// single byte char`
			`break buildChar;`
			`}`
			`secondByte = iter.nextByte(det);`
			`iter.charValue = (iter.charValue << 8) \| secondByte;`
			`if (firstByte >= 0xa1 && firstByte <= 0xfe) {`
			`// Two byte Char`
			`if (secondByte < 0xa1) {`
			`iter.error = true;`
			`}`
			`break buildChar;`
			`}`
			`if (firstByte == 0x8e) {`
			`// Code Set 2.`
			`// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.`
			`// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.`
			`// We don't know which we've got.`
			`// Treat it like EUC-JP. If the data really was EUC-TW, the following two`
			`// bytes will look like a well formed 2 byte char.`
			`if (secondByte < 0xa1) {`
			`iter.error = true;`
			`}`
			`break buildChar;`
			`}`
			`if (firstByte == 0x8f) {`
			`// Code set 3.`
			`// Three byte total char size, two bytes of actual char value.`
			`thirdByte = iter.nextByte(det);`
			`iter.charValue = (iter.charValue << 8) \| thirdByte;`
			`if (thirdByte < 0xa1) {`
			`iter.error = true;`
			`}`
			`}`
			`}`
			`return iter.done == false;`
			`}`

			`/**`
			`* The charset recognize for EUC-JP. A singleton instance of this class`
			`* is created and kept by the public CharsetDetector class`
			`*/`
			`export class euc_jp extends mbcs {`
			`name() {`
			`return 'EUC-JP';`
			`}`
			`language() {`
			`return 'ja';`
			`}`

			`// TODO: This set of data comes from the character frequency-`
			`// of-occurrence analysis tool. The data needs to be moved`
			`// into a resource and loaded from there.`
			`commonChars = [`
			`0xa1a1,`
			`0xa1a2,`
			`0xa1a3,`
			`0xa1a6,`
			`0xa1bc,`
			`0xa1ca,`
			`0xa1cb,`
			`0xa1d6,`
			`0xa1d7,`
			`0xa4a2,`
			`0xa4a4,`
			`0xa4a6,`
			`0xa4a8,`
			`0xa4aa,`
			`0xa4ab,`
			`0xa4ac,`
			`0xa4ad,`
			`0xa4af,`
			`0xa4b1,`
			`0xa4b3,`
			`0xa4b5,`
			`0xa4b7,`
			`0xa4b9,`
			`0xa4bb,`
			`0xa4bd,`
			`0xa4bf,`
			`0xa4c0,`
			`0xa4c1,`
			`0xa4c3,`
			`0xa4c4,`
			`0xa4c6,`
			`0xa4c7,`
			`0xa4c8,`
			`0xa4c9,`
			`0xa4ca,`
			`0xa4cb,`
			`0xa4ce,`
			`0xa4cf,`
			`0xa4d0,`
			`0xa4de,`
			`0xa4df,`
			`0xa4e1,`
			`0xa4e2,`
			`0xa4e4,`
			`0xa4e8,`
			`0xa4e9,`
			`0xa4ea,`
			`0xa4eb,`
			`0xa4ec,`
			`0xa4ef,`
			`0xa4f2,`
			`0xa4f3,`
			`0xa5a2,`
			`0xa5a3,`
			`0xa5a4,`
			`0xa5a6,`
			`0xa5a7,`
			`0xa5aa,`
			`0xa5ad,`
			`0xa5af,`
			`0xa5b0,`
			`0xa5b3,`
			`0xa5b5,`
			`0xa5b7,`
			`0xa5b8,`
			`0xa5b9,`
			`0xa5bf,`
			`0xa5c3,`
			`0xa5c6,`
			`0xa5c7,`
			`0xa5c8,`
			`0xa5c9,`
			`0xa5cb,`
			`0xa5d0,`
			`0xa5d5,`
			`0xa5d6,`
			`0xa5d7,`
			`0xa5de,`
			`0xa5e0,`
			`0xa5e1,`
			`0xa5e5,`
			`0xa5e9,`
			`0xa5ea,`
			`0xa5eb,`
			`0xa5ec,`
			`0xa5ed,`
			`0xa5f3,`
			`0xb8a9,`
			`0xb9d4,`
			`0xbaee,`
			`0xbbc8,`
			`0xbef0,`
			`0xbfb7,`
			`0xc4ea,`
			`0xc6fc,`
			`0xc7bd,`
			`0xcab8,`
			`0xcaf3,`
			`0xcbdc,`
			`0xcdd1,`
			`];`

			`nextChar = eucNextChar;`
			`}`

			`/**`
			`* The charset recognize for EUC-KR. A singleton instance of this class`
			`* is created and kept by the public CharsetDetector class`
			`*/`
			`export class euc_kr extends mbcs {`
			`name() {`
			`return 'EUC-KR';`
			`}`

			`language() {`
			`return 'ko';`
			`}`

			`// TODO: This set of data comes from the character frequency-`
			`// of-occurrence analysis tool. The data needs to be moved`
			`// into a resource and loaded from there.`
			`commonChars = [`
			`0xb0a1,`
			`0xb0b3,`
			`0xb0c5,`
			`0xb0cd,`
			`0xb0d4,`
			`0xb0e6,`
			`0xb0ed,`
			`0xb0f8,`
			`0xb0fa,`
			`0xb0fc,`
			`0xb1b8,`
			`0xb1b9,`
			`0xb1c7,`
			`0xb1d7,`
			`0xb1e2,`
			`0xb3aa,`
			`0xb3bb,`
			`0xb4c2,`
			`0xb4cf,`
			`0xb4d9,`
			`0xb4eb,`
			`0xb5a5,`
			`0xb5b5,`
			`0xb5bf,`
			`0xb5c7,`
			`0xb5e9,`
			`0xb6f3,`
			`0xb7af,`
			`0xb7c2,`
			`0xb7ce,`
			`0xb8a6,`
			`0xb8ae,`
			`0xb8b6,`
			`0xb8b8,`
			`0xb8bb,`
			`0xb8e9,`
			`0xb9ab,`
			`0xb9ae,`
			`0xb9cc,`
			`0xb9ce,`
			`0xb9fd,`
			`0xbab8,`
			`0xbace,`
			`0xbad0,`
			`0xbaf1,`
			`0xbbe7,`
			`0xbbf3,`
			`0xbbfd,`
			`0xbcad,`
			`0xbcba,`
			`0xbcd2,`
			`0xbcf6,`
			`0xbdba,`
			`0xbdc0,`
			`0xbdc3,`
			`0xbdc5,`
			`0xbec6,`
			`0xbec8,`
			`0xbedf,`
			`0xbeee,`
			`0xbef8,`
			`0xbefa,`
			`0xbfa1,`
			`0xbfa9,`
			`0xbfc0,`
			`0xbfe4,`
			`0xbfeb,`
			`0xbfec,`
			`0xbff8,`
			`0xc0a7,`
			`0xc0af,`
			`0xc0b8,`
			`0xc0ba,`
			`0xc0bb,`
			`0xc0bd,`
			`0xc0c7,`
			`0xc0cc,`
			`0xc0ce,`
			`0xc0cf,`
			`0xc0d6,`
			`0xc0da,`
			`0xc0e5,`
			`0xc0fb,`
			`0xc0fc,`
			`0xc1a4,`
			`0xc1a6,`
			`0xc1b6,`
			`0xc1d6,`
			`0xc1df,`
			`0xc1f6,`
			`0xc1f8,`
			`0xc4a1,`
			`0xc5cd,`
			`0xc6ae,`
			`0xc7cf,`
			`0xc7d1,`
			`0xc7d2,`
			`0xc7d8,`
			`0xc7e5,`
			`0xc8ad,`
			`];`

			`nextChar = eucNextChar;`
			`}`

			`/**`
			`* GB-18030 recognizer. Uses simplified Chinese statistics.`
			`*/`
			`export class gb_18030 extends mbcs {`
			`name() {`
			`return 'GB18030';`
			`}`

			`language() {`
			`return 'zh';`
			`}`

			`/*`
			`* Get the next character value for EUC based encodings.`
			`* Character 'value' is simply the raw bytes that make up the character`
			`* packed into an int.`
			`*/`

			`nextChar(iter: IteratedChar, det: Context) {`
			`iter.index = iter.nextIndex;`
			`iter.error = false;`
			`var firstByte = 0;`
			`var secondByte = 0;`
			`var thirdByte = 0;`
			`var fourthByte = 0;`
			`buildChar: {`
			`firstByte = iter.charValue = iter.nextByte(det);`
			`if (firstByte < 0) {`
			`// Ran off the end of the input data`
			`iter.done = true;`
			`break buildChar;`
			`}`
			`if (firstByte <= 0x80) {`
			`// single byte char`
			`break buildChar;`
			`}`
			`secondByte = iter.nextByte(det);`
			`iter.charValue = (iter.charValue << 8) \| secondByte;`
			`if (firstByte >= 0x81 && firstByte <= 0xfe) {`
			`// Two byte Char`
			`if (`
			`(secondByte >= 0x40 && secondByte <= 0x7e) \|\|`
			`(secondByte >= 80 && secondByte <= 0xfe)`
			`) {`
			`break buildChar;`
			`}`
			`// Four byte char`
			`if (secondByte >= 0x30 && secondByte <= 0x39) {`
			`thirdByte = iter.nextByte(det);`
			`if (thirdByte >= 0x81 && thirdByte <= 0xfe) {`
			`fourthByte = iter.nextByte(det);`
			`if (fourthByte >= 0x30 && fourthByte <= 0x39) {`
			`iter.charValue =`
			`(iter.charValue << 16) \| (thirdByte << 8) \| fourthByte;`
			`break buildChar;`
			`}`
			`}`
			`}`
			`iter.error = true;`
			`break buildChar;`
			`}`
			`}`
			`return iter.done == false;`
			`}`

			`// TODO: This set of data comes from the character frequency-`
			`// of-occurrence analysis tool. The data needs to be moved`
			`// into a resource and loaded from there.`
			`commonChars = [`
			`0xa1a1,`
			`0xa1a2,`
			`0xa1a3,`
			`0xa1a4,`
			`0xa1b0,`
			`0xa1b1,`
			`0xa1f1,`
			`0xa1f3,`
			`0xa3a1,`
			`0xa3ac,`
			`0xa3ba,`
			`0xb1a8,`
			`0xb1b8,`
			`0xb1be,`
			`0xb2bb,`
			`0xb3c9,`
			`0xb3f6,`
			`0xb4f3,`
			`0xb5bd,`
			`0xb5c4,`
			`0xb5e3,`
			`0xb6af,`
			`0xb6d4,`
			`0xb6e0,`
			`0xb7a2,`
			`0xb7a8,`
			`0xb7bd,`
			`0xb7d6,`
			`0xb7dd,`
			`0xb8b4,`
			`0xb8df,`
			`0xb8f6,`
			`0xb9ab,`
			`0xb9c9,`
			`0xb9d8,`
			`0xb9fa,`
			`0xb9fd,`
			`0xbacd,`
			`0xbba7,`
			`0xbbd6,`
			`0xbbe1,`
			`0xbbfa,`
			`0xbcbc,`
			`0xbcdb,`
			`0xbcfe,`
			`0xbdcc,`
			`0xbecd,`
			`0xbedd,`
			`0xbfb4,`
			`0xbfc6,`
			`0xbfc9,`
			`0xc0b4,`
			`0xc0ed,`
			`0xc1cb,`
			`0xc2db,`
			`0xc3c7,`
			`0xc4dc,`
			`0xc4ea,`
			`0xc5cc,`
			`0xc6f7,`
			`0xc7f8,`
			`0xc8ab,`
			`0xc8cb,`
			`0xc8d5,`
			`0xc8e7,`
			`0xc9cf,`
			`0xc9fa,`
			`0xcab1,`
			`0xcab5,`
			`0xcac7,`
			`0xcad0,`
			`0xcad6,`
			`0xcaf5,`
			`0xcafd,`
			`0xccec,`
			`0xcdf8,`
			`0xceaa,`
			`0xcec4,`
			`0xced2,`
			`0xcee5,`
			`0xcfb5,`
			`0xcfc2,`
			`0xcfd6,`
			`0xd0c2,`
			`0xd0c5,`
			`0xd0d0,`
			`0xd0d4,`
			`0xd1a7,`
			`0xd2aa,`
			`0xd2b2,`
			`0xd2b5,`
			`0xd2bb,`
			`0xd2d4,`
			`0xd3c3,`
			`0xd3d0,`
			`0xd3fd,`
			`0xd4c2,`
			`0xd4da,`
			`0xd5e2,`
			`0xd6d0,`
			`];`
			`}`