cleanup
This commit is contained in:
parent
199a24bad6
commit
9fd949edf3
|
@ -27,8 +27,8 @@ function binarySearch(arr, searchValue) {
|
||||||
return find(arr, searchValue, 0, arr.length - 1);
|
return find(arr, searchValue, 0, arr.length - 1);
|
||||||
};
|
};
|
||||||
|
|
||||||
// "Character" iterated character class.
|
// 'Character' iterated character class.
|
||||||
// Recognizers for specific mbcs encodings make their "characters" available
|
// Recognizers for specific mbcs encodings make their 'characters' available
|
||||||
// by providing a nextChar() function that fills in an instance of iteratedChar
|
// by providing a nextChar() function that fills in an instance of iteratedChar
|
||||||
// with the next char from the input.
|
// with the next char from the input.
|
||||||
// The returned characters are not converted to Unicode, but remain as the raw
|
// The returned characters are not converted to Unicode, but remain as the raw
|
||||||
|
@ -116,7 +116,7 @@ mbcs.prototype.match = function(det) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
|
if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
|
||||||
// console.log('its here!')
|
// console.log('its here!')
|
||||||
// Bail out early if the byte data is not matching the encoding scheme.
|
// Bail out early if the byte data is not matching the encoding scheme.
|
||||||
break detectBlock;
|
break detectBlock;
|
||||||
|
@ -143,7 +143,7 @@ mbcs.prototype.match = function(det) {
|
||||||
// No match if there are too many characters that don't fit the encoding scheme.
|
// No match if there are too many characters that don't fit the encoding scheme.
|
||||||
// (should we have zero tolerance for these?)
|
// (should we have zero tolerance for these?)
|
||||||
//
|
//
|
||||||
if (doubleByteCharCount < 20*badCharCount) {
|
if (doubleByteCharCount < 20 * badCharCount) {
|
||||||
confidence = 0;
|
confidence = 0;
|
||||||
break detectBlock;
|
break detectBlock;
|
||||||
}
|
}
|
||||||
|
@ -152,7 +152,7 @@ mbcs.prototype.match = function(det) {
|
||||||
// We have no statistics on frequently occurring characters.
|
// We have no statistics on frequently occurring characters.
|
||||||
// Assess confidence purely on having a reasonable number of
|
// Assess confidence purely on having a reasonable number of
|
||||||
// multi-byte characters (the more the better).
|
// multi-byte characters (the more the better).
|
||||||
confidence = 30 + doubleByteCharCount - 20*badCharCount;
|
confidence = 30 + doubleByteCharCount - 20 * badCharCount;
|
||||||
if (confidence > 100) {
|
if (confidence > 100) {
|
||||||
confidence = 100;
|
confidence = 100;
|
||||||
}
|
}
|
||||||
|
@ -162,7 +162,7 @@ mbcs.prototype.match = function(det) {
|
||||||
//
|
//
|
||||||
var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
|
var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
|
||||||
var scaleFactor = 90.0 / maxVal;
|
var scaleFactor = 90.0 / maxVal;
|
||||||
confidence = Math.floor(Math.log(commonCharCount+1) * scaleFactor + 10);
|
confidence = Math.floor(Math.log(commonCharCount + 1) * scaleFactor + 10);
|
||||||
confidence = Math.min(confidence, 100);
|
confidence = Math.min(confidence, 100);
|
||||||
}
|
}
|
||||||
} // end of detectBlock:
|
} // end of detectBlock:
|
||||||
|
@ -177,7 +177,7 @@ mbcs.prototype.match = function(det) {
|
||||||
*
|
*
|
||||||
* This function is not a method of class iteratedChar only because
|
* This function is not a method of class iteratedChar only because
|
||||||
* that would require a lot of extra derived classes, which is awkward.
|
* that would require a lot of extra derived classes, which is awkward.
|
||||||
* @param it The iteratedChar "struct" into which the returned char is placed.
|
* @param it The iteratedChar 'struct' into which the returned char is placed.
|
||||||
* @param det The charset detector, which is needed to get at the input byte data
|
* @param det The charset detector, which is needed to get at the input byte data
|
||||||
* being iterated over.
|
* being iterated over.
|
||||||
* @return True if a character was returned, false at end of input.
|
* @return True if a character was returned, false at end of input.
|
||||||
|
@ -192,10 +192,10 @@ mbcs.prototype.nextChar = function(iter, det) {};
|
||||||
*/
|
*/
|
||||||
module.exports.sjis = function() {
|
module.exports.sjis = function() {
|
||||||
this.name = function() {
|
this.name = function() {
|
||||||
return "Shift-JIS";
|
return 'Shift-JIS';
|
||||||
};
|
};
|
||||||
this.language = function() {
|
this.language = function() {
|
||||||
return "ja";
|
return 'ja';
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
|
@ -216,20 +216,18 @@ module.exports.sjis = function() {
|
||||||
|
|
||||||
var firstByte;
|
var firstByte;
|
||||||
firstByte = iter.charValue = iter.nextByte(det);
|
firstByte = iter.charValue = iter.nextByte(det);
|
||||||
if (firstByte < 0) {
|
if (firstByte < 0)
|
||||||
return false;
|
return false;
|
||||||
}
|
|
||||||
|
|
||||||
if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
|
if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf))
|
||||||
return true;
|
return true;
|
||||||
}
|
|
||||||
|
|
||||||
var secondByte = iter.nextByte(det);
|
var secondByte = iter.nextByte(det);
|
||||||
if (secondByte < 0) {
|
if (secondByte < 0)
|
||||||
return false;
|
return false;
|
||||||
}
|
|
||||||
iter.charValue = (firstByte << 8) | secondByte;
|
iter.charValue = (firstByte << 8) | secondByte;
|
||||||
if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
|
if (! ((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
|
||||||
// Illegal second byte value.
|
// Illegal second byte value.
|
||||||
iter.error = true;
|
iter.error = true;
|
||||||
}
|
}
|
||||||
|
@ -245,10 +243,10 @@ util.inherits(module.exports.sjis, mbcs);
|
||||||
*/
|
*/
|
||||||
module.exports.big5 = function() {
|
module.exports.big5 = function() {
|
||||||
this.name = function() {
|
this.name = function() {
|
||||||
return "Big5";
|
return 'Big5';
|
||||||
};
|
};
|
||||||
this.language = function() {
|
this.language = function() {
|
||||||
return "zh";
|
return 'zh';
|
||||||
};
|
};
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
// of-occurrence analysis tool. The data needs to be moved
|
// of-occurrence analysis tool. The data needs to be moved
|
||||||
|
@ -300,7 +298,7 @@ util.inherits(module.exports.big5, mbcs);
|
||||||
* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
|
* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
|
||||||
*
|
*
|
||||||
* Get the next character value for EUC based encodings.
|
* Get the next character value for EUC based encodings.
|
||||||
* Character "value" is simply the raw bytes that make up the character
|
* Character 'value' is simply the raw bytes that make up the character
|
||||||
* packed into an int.
|
* packed into an int.
|
||||||
*/
|
*/
|
||||||
function eucNextChar(iter, det) {
|
function eucNextChar(iter, det) {
|
||||||
|
@ -363,10 +361,10 @@ function eucNextChar(iter, det) {
|
||||||
*/
|
*/
|
||||||
module.exports.euc_jp = function() {
|
module.exports.euc_jp = function() {
|
||||||
this.name = function() {
|
this.name = function() {
|
||||||
return "EUC-JP";
|
return 'EUC-JP';
|
||||||
};
|
};
|
||||||
this.language = function() {
|
this.language = function() {
|
||||||
return "ja";
|
return 'ja';
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
|
@ -397,10 +395,10 @@ util.inherits(module.exports.euc_jp, mbcs);
|
||||||
*/
|
*/
|
||||||
module.exports.euc_kr = function() {
|
module.exports.euc_kr = function() {
|
||||||
this.name = function() {
|
this.name = function() {
|
||||||
return "EUC-KR";
|
return 'EUC-KR';
|
||||||
};
|
};
|
||||||
this.language = function() {
|
this.language = function() {
|
||||||
return "ko";
|
return 'ko';
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: This set of data comes from the character frequency-
|
// TODO: This set of data comes from the character frequency-
|
||||||
|
@ -430,15 +428,15 @@ util.inherits(module.exports.euc_kr, mbcs);
|
||||||
*/
|
*/
|
||||||
module.exports.gb_18030 = function() {
|
module.exports.gb_18030 = function() {
|
||||||
this.name = function() {
|
this.name = function() {
|
||||||
return "GB18030";
|
return 'GB18030';
|
||||||
};
|
};
|
||||||
this.language = function() {
|
this.language = function() {
|
||||||
return "zh";
|
return 'zh';
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get the next character value for EUC based encodings.
|
* Get the next character value for EUC based encodings.
|
||||||
* Character "value" is simply the raw bytes that make up the character
|
* Character 'value' is simply the raw bytes that make up the character
|
||||||
* packed into an int.
|
* packed into an int.
|
||||||
*/
|
*/
|
||||||
this.nextChar = function(iter, det) {
|
this.nextChar = function(iter, det) {
|
||||||
|
@ -463,7 +461,7 @@ module.exports.gb_18030 = function() {
|
||||||
iter.charValue = (iter.charValue << 8) | secondByte;
|
iter.charValue = (iter.charValue << 8) | secondByte;
|
||||||
if (firstByte >= 0x81 && firstByte <= 0xFE) {
|
if (firstByte >= 0x81 && firstByte <= 0xFE) {
|
||||||
// Two byte Char
|
// Two byte Char
|
||||||
if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
|
if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
|
||||||
break buildChar;
|
break buildChar;
|
||||||
}
|
}
|
||||||
// Four byte char
|
// Four byte char
|
||||||
|
|
Loading…
Reference in New Issue