594 lines
17 KiB
JavaScript
594 lines
17 KiB
JavaScript
/*!
 * Shim implementation of the TextEncoder, TextDecoder spec:
 * http://encoding.spec.whatwg.org/#interface-textencoder
 *
 * http://code.google.com/p/stringencoding/source/browse/encoding.js
 * 09b44d71759d on Sep 19, 2013
 * Used under Apache License 2.0 - http://code.google.com/p/stringencoding/
 *
 * Filer: modified to remove non-utf8 aspects, converted to CommonJS
 */
|
|
(function(global) {
|
|
'use strict';
|
|
|
|
//
// Utilities
//
|
|
|
|
/**
 * Tests whether a number lies inside a closed interval.
 *
 * @param {number} a The value being tested.
 * @param {number} min Lower bound of the interval, inclusive.
 * @param {number} max Upper bound of the interval, inclusive.
 * @return {boolean} True when min <= a and a <= max, false otherwise
 *     (including when any comparison involves NaN).
 */
function inRange(a, min, max) {
  // Guard clause: reject values that are not at or above the lower bound.
  if (!(min <= a)) {
    return false;
  }
  return a <= max;
}
|
|
|
|
/**
 * Integer division that rounds towards negative infinity.
 *
 * @param {number} n The numerator.
 * @param {number} d The denominator.
 * @return {number} The largest integer not greater than n / d.
 */
function div(n, d) {
  var quotient = n / d;
  return Math.floor(quotient);
}
|
|
|
|
|
|
//
// Implementation of Encoding specification
// http://dvcs.w3.org/hg/encoding/raw-file/tip/Overview.html
//

//
// 3. Terminology
//

//
// 4. Encodings
//
|
|
|
|
/**
 * End-of-stream sentinels. Both are -1, a value that is neither a valid byte
 * nor a valid code point, so stream readers can return them in-band.
 */
var /** @const */ EOF_byte = -1,
    /** @const */ EOF_code_point = -1;
|
|
|
|
/**
 * Read cursor over an array of bytes. The cursor starts at index 0 and is
 * moved explicitly with offset(); get() and match() never move it.
 *
 * @constructor
 * @param {Uint8Array} bytes Array of bytes that provide the stream.
 */
function ByteInputStream(bytes) {
  /** @type {number} Index of the byte that get() will return. */
  var pos = 0;

  /**
   * @return {number} The byte under the cursor, or EOF_byte once the cursor
   *     has reached the end of the array. The cursor is not advanced.
   */
  this.get = function() {
    return (pos >= bytes.length) ? EOF_byte : Number(bytes[pos]);
  };

  /**
   * Moves the cursor. The move is validated before it is committed, so a
   * rejected seek leaves the cursor where it was.
   *
   * @param {number} n Number (positive or negative) by which to
   *     offset the byte pointer.
   * @throws {Error} If the move would place the cursor before index 0 or
   *     beyond the end of the array (one past the last byte is allowed).
   */
  this.offset = function(n) {
    var target = pos + n;
    if (target < 0) {
      throw new Error('Seeking past start of the buffer');
    }
    if (target > bytes.length) {
      throw new Error('Seeking past EOF');
    }
    pos = target;
  };

  /**
   * @param {Array.<number>} test Array of bytes to compare against.
   * @return {boolean} True if the bytes starting at the current cursor
   *     position equal the test bytes. The cursor is not advanced.
   */
  this.match = function(test) {
    // Not enough bytes left after the cursor to hold the whole pattern.
    if (test.length > bytes.length - pos) {
      return false;
    }
    var i;
    for (i = 0; i < test.length; i += 1) {
      if (Number(bytes[pos + i]) !== test[i]) {
        return false;
      }
    }
    return true;
  };
}
|
|
|
|
/**
 * Write cursor that stores bytes into a caller-supplied array, starting at
 * index 0 and moving forward by one slot per byte written.
 *
 * @constructor
 * @param {Array.<number>} bytes The array to write bytes into.
 */
function ByteOutputStream(bytes) {
  /** @type {number} Index at which the next byte will be stored. */
  var pos = 0;

  /**
   * Writes every argument, converted with Number(), into the array.
   *
   * @param {...number} var_args The byte or bytes to emit into the stream.
   * @return {number} The last byte emitted, or EOF_byte when called with no
   *     arguments.
   */
  this.emit = function(var_args) {
    /** @type {number} */
    var emitted = EOF_byte;
    var total = arguments.length;
    var k = 0;
    while (k < total) {
      emitted = Number(arguments[k]);
      bytes[pos] = emitted;
      pos += 1;
      k += 1;
    }
    return emitted;
  };
}
|
|
|
|
/**
 * Read cursor over the Unicode code points of a string. The string is
 * converted to code points once, at construction time.
 *
 * @constructor
 * @param {string} string The source of code units for the stream.
 */
function CodePointInputStream(string) {
  /**
   * Converts UTF-16 code units to code points. A valid surrogate pair becomes
   * one supplementary code point; any unpaired surrogate becomes U+FFFD.
   *
   * @param {string} string Input string of UTF-16 code units.
   * @return {Array.<number>} Code points.
   */
  function stringToCodePoints(string) {
    /** @type {Array.<number>} */
    var cps = [];
    // Based on http://www.w3.org/TR/WebIDL/#idl-DOMString
    var i = 0, n = string.length;
    while (i < n) {
      var c = string.charCodeAt(i);
      if (!inRange(c, 0xD800, 0xDFFF)) {
        // Not a surrogate: the code unit is the code point.
        cps.push(c);
      } else if (inRange(c, 0xDC00, 0xDFFF)) {
        // Low surrogate with no preceding high surrogate.
        cps.push(0xFFFD);
      } else { // (inRange(c, 0xD800, 0xDBFF))
        if (i === n - 1) {
          // High surrogate at the very end of the string.
          cps.push(0xFFFD);
        } else {
          var d = string.charCodeAt(i + 1);
          if (inRange(d, 0xDC00, 0xDFFF)) {
            var a = c & 0x3FF;
            var b = d & 0x3FF;
            i += 1; // the low surrogate is consumed together with the high one
            cps.push(0x10000 + (a << 10) + b);
          } else {
            // High surrogate followed by something else; that following unit
            // is processed on its own in the next iteration.
            cps.push(0xFFFD);
          }
        }
      }
      i += 1;
    }
    return cps;
  }

  /** @type {number} Index of the code point that get() will return. */
  var pos = 0;
  /** @type {Array.<number>} */
  var cps = stringToCodePoints(string);

  /**
   * Moves the cursor. The move is validated before it is committed, so a
   * rejected seek leaves the cursor where it was.
   *
   * @param {number} n The number of code points (positive or negative)
   *     to advance the code point pointer by.
   * @throws {Error} If the move would place the cursor before index 0 or
   *     beyond the end (one past the last code point is allowed).
   */
  this.offset = function(n) {
    var target = pos + n;
    if (target < 0) {
      throw new Error('Seeking past start of the buffer');
    }
    if (target > cps.length) {
      throw new Error('Seeking past EOF');
    }
    pos = target;
  };

  /**
   * @return {number} The code point under the cursor, or EOF_code_point once
   *     the cursor has reached the end. The cursor is not advanced.
   */
  this.get = function() {
    if (pos >= cps.length) {
      return EOF_code_point;
    }
    return cps[pos];
  };
}
|
|
|
|
/**
 * Accumulates code points into a JavaScript (UTF-16) string.
 *
 * @constructor
 */
function CodePointOutputStream() {
  /** @type {string} Everything emitted so far. */
  var buffer = '';

  /** @return {string} The accumulated string. */
  this.string = function() {
    return buffer;
  };

  /**
   * Appends one code point. Values up to 0xFFFF are appended as a single
   * code unit; anything else is split into a high/low surrogate pair.
   *
   * @param {number} c The code point to encode into the stream.
   */
  this.emit = function(c) {
    if (c <= 0xFFFF) {
      buffer += String.fromCharCode(c);
      return;
    }
    var supplementary = c - 0x10000;
    var high = 0xD800 + ((supplementary >> 10) & 0x3ff);
    var low = 0xDC00 + (supplementary & 0x3ff);
    buffer += String.fromCharCode(high);
    buffer += String.fromCharCode(low);
  };
}
|
|
|
|
/**
 * Error type thrown for encoding and decoding failures.
 *
 * Instances carry name 'EncodingError', the supplied message, and code 0.
 * The prototype is a fresh object inheriting from Error.prototype rather
 * than Error.prototype itself; sharing the prototype object would make every
 * plain Error satisfy `instanceof EncodingError`.
 *
 * @constructor
 * @param {string} message Description of the error.
 */
function EncodingError(message) {
  this.name = 'EncodingError';
  this.message = message;
  this.code = 0;
  // Error is never invoked as a super-constructor here, so no stack is
  // recorded automatically; capture one where the engine supports it.
  if (typeof Error.captureStackTrace === 'function') {
    Error.captureStackTrace(this, EncodingError);
  }
}
EncodingError.prototype = Object.create(Error.prototype, {
  constructor: { value: EncodingError, writable: true, configurable: true }
});
|
|
|
|
/**
 * Reports a decoding error: throws in fatal mode, otherwise yields the code
 * point that should be substituted for the malformed input.
 *
 * @param {boolean} fatal If true, decoding errors raise an exception.
 * @param {number=} opt_code_point Override the standard fallback code point.
 *     Any value other than undefined/null is used as given, including 0.
 * @return {number} The code point to insert on a decoding error; U+FFFD
 *     unless an override was supplied.
 * @throws {EncodingError} When fatal is truthy.
 */
function decoderError(fatal, opt_code_point) {
  if (fatal) {
    throw new EncodingError('Decoder error');
  }
  // Test for absence explicitly: a truthiness test would discard an
  // override of 0, which is a valid code point.
  if (opt_code_point === undefined || opt_code_point === null) {
    return 0xFFFD;
  }
  return opt_code_point;
}
|
|
|
|
/**
 * Reports a code point that the encoder cannot represent. Always throws.
 *
 * @param {number} code_point The code point that could not be encoded.
 * @throws {EncodingError} Unconditionally; the message names the code point.
 */
function encoderError(code_point) {
  var message = 'The code point ' + code_point + ' could not be encoded.';
  throw new EncodingError(message);
}
|
|
|
|
/**
 * Resolves an encoding label to its registry entry. The label is converted
 * to a string, trimmed and lower-cased before the lookup, and only own
 * properties of the label table are considered.
 *
 * @param {string} label The encoding label.
 * @return {?{name:string,labels:Array.<string>}} The matching entry, or null
 *     when the label is not registered.
 */
function getEncoding(label) {
  var key = String(label).trim().toLowerCase();
  var isKnown = Object.prototype.hasOwnProperty.call(label_to_encoding, key);
  return isKnown ? label_to_encoding[key] : null;
}
|
|
|
|
/**
 * Table of supported encodings, grouped under a heading. Only utf-8 is
 * listed.
 *
 * @type {Array.<{encodings: Array.<{name:string,labels:Array.<string>}>,
 *                heading: string}>}
 */
var encodings = [
  {
    "encodings": [
      {
        "labels": ["unicode-1-1-utf-8", "utf-8", "utf8"],
        "name": "utf-8"
      }
    ],
    "heading": "The Encoding"
  }
  // XXXfiler - removed non-utf8 aspects
];

/** Maps each encoding's name to its entry in the table above. */
var name_to_encoding = {};
/** Maps every label of every encoding to that encoding's entry. */
var label_to_encoding = {};

// Populate both lookup maps; the loop counters stay local to this function.
(function() {
  var c, e, l;
  for (c = 0; c < encodings.length; c += 1) {
    var group = encodings[c].encodings;
    for (e = 0; e < group.length; e += 1) {
      var entry = group[e];
      name_to_encoding[entry.name] = entry;
      for (l = 0; l < entry.labels.length; l += 1) {
        label_to_encoding[entry.labels[l]] = entry;
      }
    }
  }
}());
|
|
|
|
//
// 7. The encoding
//

// 7.1 utf-8
|
|
|
|
/**
 * Incremental UTF-8 decoder. Each decode() call consumes at most one byte
 * and keeps the partially assembled code point between calls, so a
 * multi-byte sequence may be fed across several calls.
 *
 * @constructor
 * @param {{fatal: boolean}} options If options.fatal is truthy, malformed
 *     input raises an exception (via decoderError) instead of yielding the
 *     replacement code point.
 */
function UTF8Decoder(options) {
  var fatal = options.fatal;
  var /** @type {number} */ utf8_code_point = 0,
      /** @type {number} */ utf8_bytes_needed = 0,
      /** @type {number} */ utf8_bytes_seen = 0,
      /** @type {number} */ utf8_lower_boundary = 0;

  /** Discards any partially decoded sequence. */
  function resetState() {
    utf8_code_point = 0;
    utf8_bytes_needed = 0;
    utf8_bytes_seen = 0;
    utf8_lower_boundary = 0;
  }

  /**
   * @param {ByteInputStream} byte_pointer The byte stream to decode.
   * @return {?number} The next code point decoded, or null if not enough
   *     data exists in the input stream to decode a complete code point.
   *     Returns EOF_code_point at end of input when no sequence is pending,
   *     and the result of decoderError() for malformed input.
   */
  this.decode = function(byte_pointer) {
    var bite = byte_pointer.get();
    if (bite === EOF_byte) {
      if (utf8_bytes_needed !== 0) {
        // Input ended in the middle of a sequence. Drop the pending state
        // so the truncation is reported once; a later call at EOF then
        // returns EOF_code_point instead of repeating the error.
        resetState();
        return decoderError(fatal);
      }
      return EOF_code_point;
    }
    byte_pointer.offset(1);

    if (utf8_bytes_needed === 0) {
      // Lead byte (or ASCII).
      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      if (inRange(bite, 0xC2, 0xDF)) {
        utf8_bytes_needed = 1;
        utf8_lower_boundary = 0x80;
        utf8_code_point = bite - 0xC0;
      } else if (inRange(bite, 0xE0, 0xEF)) {
        utf8_bytes_needed = 2;
        utf8_lower_boundary = 0x800;
        utf8_code_point = bite - 0xE0;
      } else if (inRange(bite, 0xF0, 0xF4)) {
        utf8_bytes_needed = 3;
        utf8_lower_boundary = 0x10000;
        utf8_code_point = bite - 0xF0;
      } else {
        // 0x80-0xC1 and 0xF5-0xFF can never start a sequence.
        return decoderError(fatal);
      }
      // Shift the lead byte's payload left by 6 bits per continuation byte.
      utf8_code_point = utf8_code_point * Math.pow(64, utf8_bytes_needed);
      return null;
    }

    if (!inRange(bite, 0x80, 0xBF)) {
      // Expected a continuation byte. Abandon the sequence and step back so
      // the offending byte is decoded again as the start of a new sequence.
      resetState();
      byte_pointer.offset(-1);
      return decoderError(fatal);
    }

    utf8_bytes_seen += 1;
    utf8_code_point = utf8_code_point + (bite - 0x80) *
        Math.pow(64, utf8_bytes_needed - utf8_bytes_seen);
    if (utf8_bytes_seen !== utf8_bytes_needed) {
      return null;
    }

    var code_point = utf8_code_point;
    var lower_boundary = utf8_lower_boundary;
    resetState();
    // Reject overlong forms (below the minimum for the sequence length),
    // values above U+10FFFF, and surrogates.
    if (inRange(code_point, lower_boundary, 0x10FFFF) &&
        !inRange(code_point, 0xD800, 0xDFFF)) {
      return code_point;
    }
    return decoderError(fatal);
  };
}
|
|
|
|
/**
 * UTF-8 encoder. Each encode() call consumes one code point from the input
 * stream and writes its one to four UTF-8 bytes to the output stream.
 *
 * @constructor
 * @param {{fatal: boolean}} options Accepted for symmetry with the decoder;
 *     the encoder does not read it.
 */
function UTF8Encoder(options) {
  /**
   * @param {ByteOutputStream} output_byte_stream Output byte stream.
   * @param {CodePointInputStream} code_point_pointer Input stream.
   * @return {number} The last byte emitted, or EOF_byte when the input
   *     stream is exhausted.
   * @throws {EncodingError} Via encoderError(), when the code point is a
   *     surrogate or lies outside 0..0x10FFFF.
   */
  this.encode = function(output_byte_stream, code_point_pointer) {
    var code_point = code_point_pointer.get();
    if (code_point === EOF_code_point) {
      return EOF_byte;
    }
    code_point_pointer.offset(1);
    if (inRange(code_point, 0xD800, 0xDFFF)) {
      return encoderError(code_point);
    }
    if (inRange(code_point, 0x0000, 0x007f)) {
      return output_byte_stream.emit(code_point);
    }
    // count = number of continuation bytes, offset = lead-byte marker bits.
    var count, offset;
    if (inRange(code_point, 0x0080, 0x07FF)) {
      count = 1;
      offset = 0xC0;
    } else if (inRange(code_point, 0x0800, 0xFFFF)) {
      count = 2;
      offset = 0xE0;
    } else if (inRange(code_point, 0x10000, 0x10FFFF)) {
      count = 3;
      offset = 0xF0;
    } else {
      // Not a Unicode code point (negative, above 0x10FFFF, or NaN). Without
      // this branch count/offset stay undefined and a NaN byte is emitted.
      return encoderError(code_point);
    }
    // Lead byte: the bits above the continuation bytes plus the marker.
    var result = output_byte_stream.emit(
        div(code_point, Math.pow(64, count)) + offset);
    // Continuation bytes, most significant 6-bit group first.
    while (count > 0) {
      var temp = div(code_point, Math.pow(64, count - 1));
      result = output_byte_stream.emit(0x80 + (temp % 64));
      count -= 1;
    }
    return result;
  };
}
|
|
|
|
// Attach the codec factories to the utf-8 registry entry.
(function(utf8) {
  /**
   * @param {{fatal: boolean}} options Passed through to the encoder.
   * @return {UTF8Encoder} A fresh encoder instance.
   */
  utf8.getEncoder = function(options) {
    return new UTF8Encoder(options);
  };

  /**
   * @param {{fatal: boolean}} options Passed through to the decoder.
   * @return {UTF8Decoder} A fresh decoder instance.
   */
  utf8.getDecoder = function(options) {
    return new UTF8Decoder(options);
  };
}(name_to_encoding['utf-8']));
|
|
|
|
//
// Implementation of Text Encoding Web API
//
|
|
|
|
/** @const Label used when a constructor is given no encoding. */
var DEFAULT_ENCODING = 'utf-8';

/**
 * Encoder half of the Text Encoding API. Must be invoked with `new`.
 *
 * @constructor
 * @param {string=} opt_encoding The label of the encoding;
 *     defaults to 'utf-8'.
 * @param {{fatal: boolean}=} options Only `fatal` is read; it is coerced to
 *     a boolean.
 * @throws {TypeError} When called without `new`, or when the label does not
 *     resolve to utf-8, utf-16le or utf-16be.
 */
function TextEncoder(opt_encoding, options) {
  if (!(this instanceof TextEncoder)) {
    throw new TypeError('Constructor cannot be called as a function');
  }
  var label = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
  var settings = Object(options);

  /** @private */
  this._encoding = getEncoding(label);
  var resolved = this._encoding;
  var isSupported = resolved !== null && (
      resolved.name === 'utf-8' ||
      resolved.name === 'utf-16le' ||
      resolved.name === 'utf-16be');
  if (!isSupported) {
    throw new TypeError('Unknown encoding: ' + label);
  }

  /** @private @type {boolean} True while a streaming encode is in progress. */
  this._streaming = false;
  /** @private Created by encode() when needed. */
  this._encoder = null;
  /** @private @type {{fatal: boolean}=} */
  this._options = { fatal: Boolean(settings.fatal) };

  // Expose the encoding name as a getter-only property where possible,
  // falling back to a plain property otherwise.
  if (Object.defineProperty) {
    Object.defineProperty(this, 'encoding', {
      get: function() { return this._encoding.name; }
    });
  } else {
    this.encoding = this._encoding.name;
  }
}
|
|
|
|
TextEncoder.prototype = {
  /**
   * Encodes a string into bytes using the encoding chosen at construction.
   *
   * A fresh encoder is obtained unless the previous call was made with
   * `stream: true`; the encoder is discarded again after a non-streaming
   * call.
   *
   * @param {string=} opt_string The string to encode. Any falsy value is
   *     treated as the empty string; other values are converted with
   *     String().
   * @param {{stream: boolean}=} options If `stream` is truthy the encoder is
   *     kept for the next call.
   * @return {Uint8Array} The encoded bytes.
   */
  encode: function encode(opt_string, options) {
    opt_string = opt_string ? String(opt_string) : '';
    options = Object(options);
    // TODO: any options?
    if (!this._streaming) {
      this._encoder = this._encoding.getEncoder(this._options);
    }
    this._streaming = Boolean(options.stream);

    var bytes = [];
    var output_stream = new ByteOutputStream(bytes);
    var input_stream = new CodePointInputStream(opt_string);
    while (input_stream.get() !== EOF_code_point) {
      this._encoder.encode(output_stream, input_stream);
    }
    if (!this._streaming) {
      // Give the encoder the chance to flush: keep calling it at end of
      // input until it reports EOF_byte.
      var last_byte;
      do {
        last_byte = this._encoder.encode(output_stream, input_stream);
      } while (last_byte !== EOF_byte);
      this._encoder = null;
    }
    return new Uint8Array(bytes);
  }
};

// Replacing the prototype object drops the default `constructor` link, so
// instances would report Object as their constructor. Restore it,
// non-enumerable where the engine allows.
if (Object.defineProperty) {
  Object.defineProperty(TextEncoder.prototype, 'constructor', {
    value: TextEncoder, writable: true, configurable: true
  });
} else {
  TextEncoder.prototype.constructor = TextEncoder;
}
|
|
|
|
|
|
/**
 * Decoder half of the Text Encoding API. Must be invoked with `new`.
 *
 * @constructor
 * @param {string=} opt_encoding The label of the encoding;
 *     defaults to 'utf-8'.
 * @param {{fatal: boolean}=} options Only `fatal` is read; it is coerced to
 *     a boolean.
 * @throws {TypeError} When called without `new`, or when the label is not a
 *     registered encoding.
 */
function TextDecoder(opt_encoding, options) {
  if (!(this instanceof TextDecoder)) {
    throw new TypeError('Constructor cannot be called as a function');
  }
  var label = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
  var settings = Object(options);

  /** @private */
  this._encoding = getEncoding(label);
  if (this._encoding === null) {
    throw new TypeError('Unknown encoding: ' + label);
  }

  /** @private @type {boolean} True while a streaming decode is in progress. */
  this._streaming = false;
  /** @private Created by decode() when needed. */
  this._decoder = null;
  /** @private @type {{fatal: boolean}=} */
  this._options = { fatal: Boolean(settings.fatal) };

  // Expose the encoding name as a getter-only property where possible,
  // falling back to a plain property otherwise.
  if (Object.defineProperty) {
    Object.defineProperty(this, 'encoding', {
      get: function() { return this._encoding.name; }
    });
  } else {
    this.encoding = this._encoding.name;
  }
}
|
|
|
|
// TODO: Issue if input byte stream is offset by decoder
// TODO: BOM detection will not work if stream header spans multiple calls
// (last N bytes of previous stream may need to be retained?)
TextDecoder.prototype = {
  /**
   * Decodes bytes into a string using the encoding chosen at construction.
   *
   * A fresh decoder is obtained (and BOM tracking reset) unless the previous
   * call was made with `stream: true`; the decoder is discarded again after
   * a non-streaming call. A leading U+FEFF in the first non-empty result is
   * removed.
   *
   * @param {(ArrayBufferView|ArrayBuffer)=} opt_view The buffer of bytes to
   *     decode. A falsy value decodes as empty input; a plain ArrayBuffer is
   *     read in full.
   * @param {{stream: boolean}=} options If `stream` is truthy the decoder
   *     (and any partially decoded sequence it holds) is kept for the next
   *     call.
   * @return {string} The decoded text.
   * @throws {TypeError} If opt_view is truthy but is neither an ArrayBuffer
   *     nor an object with buffer/byteOffset/byteLength.
   */
  decode: function decode(opt_view, options) {
    if (!opt_view) {
      opt_view = new Uint8Array(0);
    } else if (Object.prototype.toString.call(opt_view) ===
               '[object ArrayBuffer]') {
      // Wrap a bare ArrayBuffer so the view handling below applies to it.
      opt_view = new Uint8Array(opt_view);
    } else if (Object(opt_view) !== opt_view ||
               !('buffer' in opt_view && 'byteOffset' in opt_view &&
                 'byteLength' in opt_view)) {
      // Primitives are rejected here, before the `in` operator sees them.
      throw new TypeError('Expected ArrayBufferView');
    }
    options = Object(options);

    if (!this._streaming) {
      this._decoder = this._encoding.getDecoder(this._options);
      this._BOMseen = false;
    }
    this._streaming = Boolean(options.stream);

    // Re-view the same memory as bytes, whatever the element type of the
    // view that was passed in.
    var bytes = new Uint8Array(opt_view.buffer,
                               opt_view.byteOffset,
                               opt_view.byteLength);
    var input_stream = new ByteInputStream(bytes);

    var output_stream = new CodePointOutputStream(), code_point;
    while (input_stream.get() !== EOF_byte) {
      code_point = this._decoder.decode(input_stream);
      if (code_point !== null && code_point !== EOF_code_point) {
        output_stream.emit(code_point);
      }
    }
    if (!this._streaming) {
      // Flush: call the decoder at end of input so that an incomplete
      // trailing sequence is reported.
      do {
        code_point = this._decoder.decode(input_stream);
        if (code_point !== null && code_point !== EOF_code_point) {
          output_stream.emit(code_point);
        }
      } while (code_point !== EOF_code_point &&
               input_stream.get() !== EOF_byte);
      this._decoder = null;
    }

    var result = output_stream.string();
    if (!this._BOMseen && result.length) {
      this._BOMseen = true;
      if (['utf-8', 'utf-16le', 'utf-16be'].indexOf(this.encoding) !== -1 &&
          result.charCodeAt(0) === 0xFEFF) {
        result = result.substring(1);
      }
    }

    return result;
  }
};

// Replacing the prototype object drops the default `constructor` link, so
// instances would report Object as their constructor. Restore it,
// non-enumerable where the engine allows.
if (Object.defineProperty) {
  Object.defineProperty(TextDecoder.prototype, 'constructor', {
    value: TextDecoder, writable: true, configurable: true
  });
} else {
  TextDecoder.prototype.constructor = TextDecoder;
}
|
|
|
|
// Prefer native impl if available
|
|
module.exports = {
|
|
TextEncoder: global['TextEncoder'] || TextEncoder,
|
|
TextDecoder: global['TextDecoder'] || TextDecoder
|
|
};
|
|
}(this));
|