// chardet/src/encoding/mbcs.ts

import { Context, Recogniser } from '.';
import match, { Match } from '../match';

/**
 * Binary search implementation (iterative).
 *
 * @param arr         Numbers sorted in ascending order.
 * @param searchValue The value to locate.
 * @returns The index of an element equal to `searchValue`, or -1 when the
 *          value is not present (including when `arr` is empty).
 */
function binarySearch(arr: number[], searchValue: number) {
  let low = 0;
  let high = arr.length - 1;

  while (low <= high) {
    // The unsigned shift halves the sum and truncates it to an integer.
    const middle = (low + high) >>> 1;
    const candidate = arr[middle];

    if (searchValue > candidate) {
      low = middle + 1;
    } else if (searchValue < candidate) {
      high = middle - 1;
    } else {
      return middle;
    }
  }

  return -1;
}

// Iterated character: the state shared between a recogniser's nextChar()
// function and the matching loop.
//    Recognisers for specific mbcs encodings expose their 'characters' by
//    providing a nextChar() function that fills in an instance of this class
//    with the next char from the input.
//    The characters are not converted to Unicode; they stay as the raw bytes
//    (concatenated into an int) from the codepage data.
//
//  For Asian charsets the raw input is used rather than the input that has
//   been stripped of markup.  Detection only considers multi-byte chars, which
//   effectively strips markup anyway, and double byte chars do occur in markup
//   too.
//
class IteratedChar {
  /** 1-4 bytes from the raw input data, packed into one number. */
  charValue = 0;
  /** Offset of the first byte of the current character. */
  index = 0;
  /** Offset of the next byte that nextByte() will read. */
  nextIndex = 0;
  /** Set by a recogniser when the current byte sequence is malformed. */
  error = false;
  /** Set once the end of the raw input has been reached. */
  done = false;

  /**
   * Rewind to the start of the input and clear all flags. Note that `index`
   * becomes -1 here (no character read yet), not 0 as after construction.
   */
  reset() {
    this.index = -1;
    this.nextIndex = 0;
    this.charValue = 0;
    this.done = false;
    this.error = false;
  }

  /**
   * Read one byte from `det.fRawInput` and advance `nextIndex`.
   *
   * @param det Detection context supplying `fRawInput` and `fRawLength`.
   * @returns The byte masked to its low 8 bits, or -1 (with `done` set) when
   *          `nextIndex` has reached `det.fRawLength`.
   */
  nextByte(det: Context) {
    if (this.nextIndex < det.fRawLength) {
      return det.fRawInput[this.nextIndex++] & 0xff;
    }

    this.done = true;
    return -1;
  }
}

/**
 * Asian double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 */

class mbcs implements Recogniser {
  /**
   * Raw (packed-byte) values of frequently occurring characters.
   * Must be sorted ascending because match() looks values up with binarySearch.
   */
  commonChars: number[] = [];

  name() {
    return 'mbcs';
  }

  /**
   * Test the match of this charset with the input text data.
   *
   * Characters are pulled one at a time through nextChar(); the method counts
   * well-formed multi-byte characters, malformed sequences, and multi-byte
   * characters that appear in `commonChars`, then derives a confidence value.
   *
   * @param det  The detection context, which contains the input text
   *             to be checked for being in this charset.
   * @return     `null` when the computed confidence is 0; otherwise the
   *             result of the imported `match` helper called with this
   *             recogniser and the confidence (at most 100).
   */
  match(det: Context): Match | null {
    let doubleByteCharCount = 0;
    let commonCharCount = 0;
    let badCharCount = 0;
    let totalCharCount = 0;
    let confidence = 0;

    const iter = new IteratedChar();

    detectBlock: {
      for (iter.reset(); this.nextChar(iter, det); ) {
        totalCharCount++;
        if (iter.error) {
          badCharCount++;
        } else {
          // Reinterpret the packed bytes as an unsigned 32-bit value.
          // `& 0xffffffff` would give a signed result in JavaScript, so a
          // four-byte character whose lead byte is >= 0x80 would come out
          // negative and be mistaken for a single-byte character.
          const cv = iter.charValue >>> 0;

          if (cv > 0xff) {
            doubleByteCharCount++;
            if (this.commonChars != null) {
              // NOTE: This assumes that there are no 4-byte common chars.
              if (binarySearch(this.commonChars, cv) >= 0) {
                commonCharCount++;
              }
            }
          }
        }
        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
          // Bail out early if the byte data is not matching the encoding scheme.
          break detectBlock;
        }
      }

      if (doubleByteCharCount <= 10 && badCharCount === 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount === 0 && totalCharCount < 10) {
          // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
          // We don't have enough data to have any confidence.
          // Statistical analysis of single byte non-ASCII characters would probably help here.
          confidence = 0;
        } else {
          //   ASCII or ISO file?  It's probably not our encoding,
          //   but is not incompatible with our encoding, so don't give it a zero.
          confidence = 10;
        }
        break detectBlock;
      }

      //
      //  No match if there are too many characters that don't fit the encoding scheme.
      //    (should we have zero tolerance for these?)
      //
      if (doubleByteCharCount < 20 * badCharCount) {
        confidence = 0;
        break detectBlock;
      }

      if (this.commonChars == null) {
        // We have no statistics on frequently occurring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better).
        confidence = 30 + doubleByteCharCount - 20 * badCharCount;
        if (confidence > 100) {
          confidence = 100;
        }
      } else {
        // Frequency of occurrence statistics exist.
        // doubleByteCharCount is at least 11 here (smaller counts were handled
        // by the two checks above), so maxVal is strictly positive.
        const maxVal = Math.log(doubleByteCharCount / 4);
        const scaleFactor = 90.0 / maxVal;
        confidence = Math.floor(
          Math.log(commonCharCount + 1) * scaleFactor + 10
        );
        confidence = Math.min(confidence, 100);
      }
    } // end of detectBlock:

    return confidence === 0 ? null : match(det, this, confidence);
  }

  /**
   * Get the next character (however many bytes it is) from the input data
   *    Subclasses for specific charset encodings must implement this function
   *    to get characters according to the rules of their encoding scheme.
   *
   *  This function is not a method of class IteratedChar only because
   *   that would require a lot of extra derived classes, which is awkward.
   *
   *  The base implementation is a placeholder that always returns true and
   *   never consumes input, so match() only terminates with an override.
   * @param iter The IteratedChar 'struct' into which the returned char is placed.
   * @param det  The detection context, which is needed to get at the input byte data
   *             being iterated over.
   * @return     True if a character was returned, false at end of input.
   */
  nextChar(iter: IteratedChar, det: Context): boolean {
    return true;
  }
}

/**
 * Recogniser for the Shift_JIS charset.
 */
export class sjis extends mbcs {
  /** Charset name reported for a match. */
  name() {
    return 'Shift_JIS';
  }

  /** Language code reported for a match. */
  language() {
    return 'ja';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  // prettier-ignore
  commonChars = [
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175,
    0x8176, 0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad,
    0x82af, 0x82b1, 0x82b3, 0x82b5, 0x82b7, 0x82bd, 0x82be, 0x82c1,
    0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 0x82cd, 0x82dc,
    0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375,
    0x8376, 0x8389, 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa,
    0x95aa,
  ];

  /**
   * Read the next one- or two-byte character into `iter`.
   *
   * Lead bytes up to 0x7f, and lead bytes in 0xa1..0xdf, stand alone as
   * single-byte characters; every other lead byte takes one trail byte.
   *
   * @returns false when the input ends (including in the middle of a
   *          two-byte character), true otherwise.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.error = false;
    iter.index = iter.nextIndex;

    const lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) return false;

    const isSingleByte = lead <= 0x7f || (lead > 0xa0 && lead <= 0xdf);
    if (isSingleByte) return true;

    const trail = iter.nextByte(det);
    if (trail < 0) return false;

    iter.charValue = (lead << 8) | trail;

    // Legal trail bytes span 0x40..0xff; anything else is an illegal value.
    if (trail < 0x40 || trail > 0xff) {
      iter.error = true;
    }
    return true;
  }
}

/**
 * Recogniser for the Big5 charset.
 */
export class big5 extends mbcs {
  /** Charset name reported for a match. */
  name() {
    return 'Big5';
  }

  /** Language code reported for a match. */
  language() {
    return 'zh';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  // prettier-ignore
  commonChars = [
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176,
    0xa440, 0xa446, 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464,
    0xa46a, 0xa46c, 0xa477, 0xa4a3, 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce,
    0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 0xa558, 0xa569,
    0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751,
    0xa759, 0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4,
    0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe, 0xaacc,
    0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 0xaec9, 0xafe0,
    0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944,
    0xbad8, 0xbb44, 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
  ];

  /**
   * Read the next one- or two-byte character into `iter`.
   *
   * Lead bytes up to 0x7f, and the lead byte 0xff, are single-byte
   * characters; every other lead byte takes one trail byte.
   *
   * @returns false when the input ends (including in the middle of a
   *          two-byte character), true otherwise.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.error = false;
    iter.index = iter.nextIndex;

    const lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) return false;

    // single byte character.
    if (lead <= 0x7f || lead === 0xff) return true;

    const trail = iter.nextByte(det);
    if (trail < 0) return false;

    iter.charValue = (lead << 8) | trail;

    // Trail bytes below 0x40, and the values 0x7f and 0xff, are illegal.
    iter.error = trail < 0x40 || trail === 0x7f || trail === 0xff;

    return true;
  }
}

/**
 *  Shared nextChar implementation for the EUC charset recognisers in this
 *  file (EUC-JP and EUC-KR assign it as their nextChar).
 *
 *  Get the next character value for EUC based encodings.
 *  Character 'value' is simply the raw bytes that make up the character
 *     packed into an int.
 *
 *  @param iter Iterator state that receives the character and error flag.
 *  @param det  Detection context the bytes are read from.
 *  @returns    false once the end of the input has been hit, true otherwise.
 */
function eucNextChar(iter: IteratedChar, det: Context) {
  iter.index = iter.nextIndex;
  iter.error = false;

  const lead = iter.nextByte(det);
  iter.charValue = lead;

  if (lead < 0) {
    // Ran off the end of the input data
    iter.done = true;
  } else if (lead > 0x8d) {
    // Anything up to 0x8d is a single byte char; beyond that a second byte
    // is always consumed.
    const second = iter.nextByte(det);
    iter.charValue = (iter.charValue << 8) | second;

    if ((lead >= 0xa1 && lead <= 0xfe) || lead === 0x8e) {
      // 0xa1..0xfe: Two byte Char.
      // 0x8e: Code Set 2.
      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
      //   We don't know which we've got, so treat it like EUC-JP.  If the data
      //   really was EUC-TW, the following two bytes will look like a well
      //   formed 2 byte char.
      if (second < 0xa1) {
        iter.error = true;
      }
    } else if (lead === 0x8f) {
      // Code set 3.
      // Three byte total char size, two bytes of actual char value.
      const third = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | third;
      if (third < 0xa1) {
        iter.error = true;
      }
    }
  }

  return !iter.done;
}

/**
 * Recogniser for the EUC-JP charset.
 */
export class euc_jp extends mbcs {
  /** Charset name reported for a match. */
  name() {
    return 'EUC-JP';
  }

  /** Language code reported for a match. */
  language() {
    return 'ja';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  // prettier-ignore
  commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6,
    0xa1d7, 0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac,
    0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb,
    0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6, 0xa4c7,
    0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb,
    0xa4ec, 0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6,
    0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7,
    0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9,
    0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9,
    0xb9d4, 0xbaee, 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd,
    0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
  ];

  // Character iteration is delegated to the shared EUC reader.
  nextChar = eucNextChar;
}

/**
 * Recogniser for the EUC-KR charset.
 */
export class euc_kr extends mbcs {
  /** Charset name reported for a match. */
  name() {
    return 'EUC-KR';
  }

  /** Language code reported for a match. */
  language() {
    return 'ko';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  // prettier-ignore
  commonChars = [
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8,
    0xb0fa, 0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa,
    0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf,
    0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 0xb8a6, 0xb8ae,
    0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd,
    0xbcad, 0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5,
    0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8, 0xbefa, 0xbfa1, 0xbfa9,
    0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8,
    0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6,
    0xc1df, 0xc1f6, 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1,
    0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
  ];

  // Character iteration is delegated to the shared EUC reader.
  nextChar = eucNextChar;
}

/**
 *   GB-18030 recognizer. Uses simplified Chinese statistics.
 */
export class gb_18030 extends mbcs {
  /** Charset name reported for a match. */
  name() {
    return 'GB18030';
  }

  /** Language code reported for a match. */
  language() {
    return 'zh';
  }

  /**
   * Get the next character value for GB18030 input.
   * Character 'value' is simply the raw bytes that make up the character
   * packed into an int.
   *
   * Byte layout checked by this method:
   *  - lead byte <= 0x80: single byte char;
   *  - lead 0x81..0xfe with trail 0x40..0x7e or 0x80..0xfe: two byte char;
   *  - lead 0x81..0xfe, second 0x30..0x39, third 0x81..0xfe,
   *    fourth 0x30..0x39: four byte char;
   *  - any other sequence after a lead byte in 0x81..0xfe sets `iter.error`.
   *
   * @param iter Iterator state that receives the character and error flag.
   * @param det  Detection context the bytes are read from.
   * @returns    false once the end of the input has been hit, true otherwise.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;

    buildChar: {
      const firstByte = (iter.charValue = iter.nextByte(det));
      if (firstByte < 0) {
        // Ran off the end of the input data
        iter.done = true;
        break buildChar;
      }
      if (firstByte <= 0x80) {
        // single byte char
        break buildChar;
      }

      const secondByte = iter.nextByte(det);
      iter.charValue = (iter.charValue << 8) | secondByte;

      if (firstByte >= 0x81 && firstByte <= 0xfe) {
        // Two byte Char
        // The upper range starts at 0x80 (hex). It used to be written as
        // decimal 80, which made the two ranges overlap and let the invalid
        // trail byte 0x7f through.
        if (
          (secondByte >= 0x40 && secondByte <= 0x7e) ||
          (secondByte >= 0x80 && secondByte <= 0xfe)
        ) {
          break buildChar;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
          const thirdByte = iter.nextByte(det);
          if (thirdByte >= 0x81 && thirdByte <= 0xfe) {
            const fourthByte = iter.nextByte(det);
            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
              iter.charValue =
                (iter.charValue << 16) | (thirdByte << 8) | fourthByte;
              break buildChar;
            }
          }
        }

        // Neither a valid two byte nor a valid four byte sequence.
        iter.error = true;
        break buildChar;
      }
    }

    return iter.done === false;
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  // prettier-ignore
  commonChars = [
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3,
    0xa3a1, 0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9,
    0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0,
    0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 0xb8df, 0xb8f6,
    0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd,
    0xbfb4, 0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7,
    0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5,
    0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 0xcad0, 0xcad6,
    0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7,
    0xd2aa, 0xd2b2, 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd,
    0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
  ];
}