filer/lib/encoding.js

594 lines
17 KiB
JavaScript

/*!
* Shim implementation of the TextEncoder, TextDecoder spec:
* http://encoding.spec.whatwg.org/#interface-textencoder
*
* http://code.google.com/p/stringencoding/source/browse/encoding.js
* 09b44d71759d on Sep 19, 2013
* Used under Apache License 2.0 - http://code.google.com/p/stringencoding/
*
* Filer: modified to remove non-utf8 aspects, converted to CommonJS
*/
(function(global) {
'use strict';
//
// Utilities
//
/**
* @param {number} a The number to test.
* @param {number} min The minimum value in the range, inclusive.
* @param {number} max The maximum value in the range, inclusive.
* @return {boolean} True if a >= min and a <= max.
*/
function inRange(a, min, max) {
return min <= a && a <= max;
}
/**
* @param {number} n The numerator.
* @param {number} d The denominator.
* @return {number} The result of the integer division of n by d.
*/
function div(n, d) {
return Math.floor(n / d);
}
//
// Implementation of Encoding specification
// http://dvcs.w3.org/hg/encoding/raw-file/tip/Overview.html
//
//
// 3. Terminology
//
//
// 4. Encodings
//
/** @const */ var EOF_byte = -1;
/** @const */ var EOF_code_point = -1;
/**
* @constructor
* @param {Uint8Array} bytes Array of bytes that provide the stream.
*/
function ByteInputStream(bytes) {
/** @type {number} */
var pos = 0;
/** @return {number} Get the next byte from the stream. */
this.get = function() {
return (pos >= bytes.length) ? EOF_byte : Number(bytes[pos]);
};
/** @param {number} n Number (positive or negative) by which to
* offset the byte pointer. */
this.offset = function(n) {
pos += n;
if (pos < 0) {
throw new Error('Seeking past start of the buffer');
}
if (pos > bytes.length) {
throw new Error('Seeking past EOF');
}
};
/**
* @param {Array.<number>} test Array of bytes to compare against.
* @return {boolean} True if the start of the stream matches the test
* bytes.
*/
this.match = function(test) {
if (test.length > pos + bytes.length) {
return false;
}
var i;
for (i = 0; i < test.length; i += 1) {
if (Number(bytes[pos + i]) !== test[i]) {
return false;
}
}
return true;
};
}
/**
* @constructor
* @param {Array.<number>} bytes The array to write bytes into.
*/
function ByteOutputStream(bytes) {
/** @type {number} */
var pos = 0;
/**
* @param {...number} var_args The byte or bytes to emit into the stream.
* @return {number} The last byte emitted.
*/
this.emit = function(var_args) {
/** @type {number} */
var last = EOF_byte;
var i;
for (i = 0; i < arguments.length; ++i) {
last = Number(arguments[i]);
bytes[pos++] = last;
}
return last;
};
}
/**
* @constructor
* @param {string} string The source of code units for the stream.
*/
function CodePointInputStream(string) {
/**
* @param {string} string Input string of UTF-16 code units.
* @return {Array.<number>} Code points.
*/
function stringToCodePoints(string) {
/** @type {Array.<number>} */
var cps = [];
// Based on http://www.w3.org/TR/WebIDL/#idl-DOMString
var i = 0, n = string.length;
while (i < string.length) {
var c = string.charCodeAt(i);
if (!inRange(c, 0xD800, 0xDFFF)) {
cps.push(c);
} else if (inRange(c, 0xDC00, 0xDFFF)) {
cps.push(0xFFFD);
} else { // (inRange(cu, 0xD800, 0xDBFF))
if (i === n - 1) {
cps.push(0xFFFD);
} else {
var d = string.charCodeAt(i + 1);
if (inRange(d, 0xDC00, 0xDFFF)) {
var a = c & 0x3FF;
var b = d & 0x3FF;
i += 1;
cps.push(0x10000 + (a << 10) + b);
} else {
cps.push(0xFFFD);
}
}
}
i += 1;
}
return cps;
}
/** @type {number} */
var pos = 0;
/** @type {Array.<number>} */
var cps = stringToCodePoints(string);
/** @param {number} n The number of bytes (positive or negative)
* to advance the code point pointer by.*/
this.offset = function(n) {
pos += n;
if (pos < 0) {
throw new Error('Seeking past start of the buffer');
}
if (pos > cps.length) {
throw new Error('Seeking past EOF');
}
};
/** @return {number} Get the next code point from the stream. */
this.get = function() {
if (pos >= cps.length) {
return EOF_code_point;
}
return cps[pos];
};
}
/**
* @constructor
*/
function CodePointOutputStream() {
/** @type {string} */
var string = '';
/** @return {string} The accumulated string. */
this.string = function() {
return string;
};
/** @param {number} c The code point to encode into the stream. */
this.emit = function(c) {
if (c <= 0xFFFF) {
string += String.fromCharCode(c);
} else {
c -= 0x10000;
string += String.fromCharCode(0xD800 + ((c >> 10) & 0x3ff));
string += String.fromCharCode(0xDC00 + (c & 0x3ff));
}
};
}
/**
* @constructor
* @param {string} message Description of the error.
*/
function EncodingError(message) {
this.name = 'EncodingError';
this.message = message;
this.code = 0;
}
EncodingError.prototype = Error.prototype;
/**
* @param {boolean} fatal If true, decoding errors raise an exception.
* @param {number=} opt_code_point Override the standard fallback code point.
* @return {number} The code point to insert on a decoding error.
*/
function decoderError(fatal, opt_code_point) {
if (fatal) {
throw new EncodingError('Decoder error');
}
return opt_code_point || 0xFFFD;
}
/**
* @param {number} code_point The code point that could not be encoded.
*/
function encoderError(code_point) {
throw new EncodingError('The code point ' + code_point +
' could not be encoded.');
}
/**
* @param {string} label The encoding label.
* @return {?{name:string,labels:Array.<string>}}
*/
function getEncoding(label) {
label = String(label).trim().toLowerCase();
if (Object.prototype.hasOwnProperty.call(label_to_encoding, label)) {
return label_to_encoding[label];
}
return null;
}
/** @type {Array.<{encodings: Array.<{name:string,labels:Array.<string>}>,
* heading: string}>} */
var encodings = [
{
"encodings": [
{
"labels": [
"unicode-1-1-utf-8",
"utf-8",
"utf8"
],
"name": "utf-8"
}
],
"heading": "The Encoding"
}
// XXXfiler - removed non-utf8 aspects
];
var name_to_encoding = {};
var label_to_encoding = {};
encodings.forEach(function(category) {
category.encodings.forEach(function(encoding) {
name_to_encoding[encoding.name] = encoding;
encoding.labels.forEach(function(label) {
label_to_encoding[label] = encoding;
});
});
});
//
// 7. The encoding
//
// 7.1 utf-8
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function UTF8Decoder(options) {
var fatal = options.fatal;
var /** @type {number} */ utf8_code_point = 0,
/** @type {number} */ utf8_bytes_needed = 0,
/** @type {number} */ utf8_bytes_seen = 0,
/** @type {number} */ utf8_lower_boundary = 0;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte) {
if (utf8_bytes_needed !== 0) {
return decoderError(fatal);
}
return EOF_code_point;
}
byte_pointer.offset(1);
if (utf8_bytes_needed === 0) {
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (inRange(bite, 0xC2, 0xDF)) {
utf8_bytes_needed = 1;
utf8_lower_boundary = 0x80;
utf8_code_point = bite - 0xC0;
} else if (inRange(bite, 0xE0, 0xEF)) {
utf8_bytes_needed = 2;
utf8_lower_boundary = 0x800;
utf8_code_point = bite - 0xE0;
} else if (inRange(bite, 0xF0, 0xF4)) {
utf8_bytes_needed = 3;
utf8_lower_boundary = 0x10000;
utf8_code_point = bite - 0xF0;
} else {
return decoderError(fatal);
}
utf8_code_point = utf8_code_point * Math.pow(64, utf8_bytes_needed);
return null;
}
if (!inRange(bite, 0x80, 0xBF)) {
utf8_code_point = 0;
utf8_bytes_needed = 0;
utf8_bytes_seen = 0;
utf8_lower_boundary = 0;
byte_pointer.offset(-1);
return decoderError(fatal);
}
utf8_bytes_seen += 1;
utf8_code_point = utf8_code_point + (bite - 0x80) *
Math.pow(64, utf8_bytes_needed - utf8_bytes_seen);
if (utf8_bytes_seen !== utf8_bytes_needed) {
return null;
}
var code_point = utf8_code_point;
var lower_boundary = utf8_lower_boundary;
utf8_code_point = 0;
utf8_bytes_needed = 0;
utf8_bytes_seen = 0;
utf8_lower_boundary = 0;
if (inRange(code_point, lower_boundary, 0x10FFFF) &&
!inRange(code_point, 0xD800, 0xDFFF)) {
return code_point;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function UTF8Encoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0xD800, 0xDFFF)) {
return encoderError(code_point);
}
if (inRange(code_point, 0x0000, 0x007f)) {
return output_byte_stream.emit(code_point);
}
var count, offset;
if (inRange(code_point, 0x0080, 0x07FF)) {
count = 1;
offset = 0xC0;
} else if (inRange(code_point, 0x0800, 0xFFFF)) {
count = 2;
offset = 0xE0;
} else if (inRange(code_point, 0x10000, 0x10FFFF)) {
count = 3;
offset = 0xF0;
}
var result = output_byte_stream.emit(
div(code_point, Math.pow(64, count)) + offset);
while (count > 0) {
var temp = div(code_point, Math.pow(64, count - 1));
result = output_byte_stream.emit(0x80 + (temp % 64));
count -= 1;
}
return result;
};
}
name_to_encoding['utf-8'].getEncoder = function(options) {
return new UTF8Encoder(options);
};
name_to_encoding['utf-8'].getDecoder = function(options) {
return new UTF8Decoder(options);
};
//
// Implementation of Text Encoding Web API
//
/** @const */ var DEFAULT_ENCODING = 'utf-8';
/**
* @constructor
* @param {string=} opt_encoding The label of the encoding;
* defaults to 'utf-8'.
* @param {{fatal: boolean}=} options
*/
function TextEncoder(opt_encoding, options) {
if (!(this instanceof TextEncoder)) {
throw new TypeError('Constructor cannot be called as a function');
}
opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
options = Object(options);
/** @private */
this._encoding = getEncoding(opt_encoding);
if (this._encoding === null || (this._encoding.name !== 'utf-8' &&
this._encoding.name !== 'utf-16le' &&
this._encoding.name !== 'utf-16be'))
throw new TypeError('Unknown encoding: ' + opt_encoding);
/** @private @type {boolean} */
this._streaming = false;
/** @private */
this._encoder = null;
/** @private @type {{fatal: boolean}=} */
this._options = { fatal: Boolean(options.fatal) };
if (Object.defineProperty) {
Object.defineProperty(
this, 'encoding',
{ get: function() { return this._encoding.name; } });
} else {
this.encoding = this._encoding.name;
}
return this;
}
TextEncoder.prototype = {
/**
* @param {string=} opt_string The string to encode.
* @param {{stream: boolean}=} options
*/
encode: function encode(opt_string, options) {
opt_string = opt_string ? String(opt_string) : '';
options = Object(options);
// TODO: any options?
if (!this._streaming) {
this._encoder = this._encoding.getEncoder(this._options);
}
this._streaming = Boolean(options.stream);
var bytes = [];
var output_stream = new ByteOutputStream(bytes);
var input_stream = new CodePointInputStream(opt_string);
while (input_stream.get() !== EOF_code_point) {
this._encoder.encode(output_stream, input_stream);
}
if (!this._streaming) {
var last_byte;
do {
last_byte = this._encoder.encode(output_stream, input_stream);
} while (last_byte !== EOF_byte);
this._encoder = null;
}
return new Uint8Array(bytes);
}
};
/**
* @constructor
* @param {string=} opt_encoding The label of the encoding;
* defaults to 'utf-8'.
* @param {{fatal: boolean}=} options
*/
function TextDecoder(opt_encoding, options) {
if (!(this instanceof TextDecoder)) {
throw new TypeError('Constructor cannot be called as a function');
}
opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
options = Object(options);
/** @private */
this._encoding = getEncoding(opt_encoding);
if (this._encoding === null)
throw new TypeError('Unknown encoding: ' + opt_encoding);
/** @private @type {boolean} */
this._streaming = false;
/** @private */
this._decoder = null;
/** @private @type {{fatal: boolean}=} */
this._options = { fatal: Boolean(options.fatal) };
if (Object.defineProperty) {
Object.defineProperty(
this, 'encoding',
{ get: function() { return this._encoding.name; } });
} else {
this.encoding = this._encoding.name;
}
return this;
}
// TODO: Issue if input byte stream is offset by decoder
// TODO: BOM detection will not work if stream header spans multiple calls
// (last N bytes of previous stream may need to be retained?)
TextDecoder.prototype = {
/**
* @param {ArrayBufferView=} opt_view The buffer of bytes to decode.
* @param {{stream: boolean}=} options
*/
decode: function decode(opt_view, options) {
if (opt_view && !('buffer' in opt_view && 'byteOffset' in opt_view &&
'byteLength' in opt_view)) {
throw new TypeError('Expected ArrayBufferView');
} else if (!opt_view) {
opt_view = new Uint8Array(0);
}
options = Object(options);
if (!this._streaming) {
this._decoder = this._encoding.getDecoder(this._options);
this._BOMseen = false;
}
this._streaming = Boolean(options.stream);
var bytes = new Uint8Array(opt_view.buffer,
opt_view.byteOffset,
opt_view.byteLength);
var input_stream = new ByteInputStream(bytes);
var output_stream = new CodePointOutputStream(), code_point;
while (input_stream.get() !== EOF_byte) {
code_point = this._decoder.decode(input_stream);
if (code_point !== null && code_point !== EOF_code_point) {
output_stream.emit(code_point);
}
}
if (!this._streaming) {
do {
code_point = this._decoder.decode(input_stream);
if (code_point !== null && code_point !== EOF_code_point) {
output_stream.emit(code_point);
}
} while (code_point !== EOF_code_point &&
input_stream.get() != EOF_byte);
this._decoder = null;
}
var result = output_stream.string();
if (!this._BOMseen && result.length) {
this._BOMseen = true;
if (['utf-8', 'utf-16le', 'utf-16be'].indexOf(this.encoding) !== -1 &&
result.charCodeAt(0) === 0xFEFF) {
result = result.substring(1);
}
}
return result;
}
};
// Prefer native impl if available
module.exports = {
TextEncoder: global['TextEncoder'] || TextEncoder,
TextDecoder: global['TextDecoder'] || TextDecoder
};
}(this));