
594 lines
17 KiB
Raw Normal View History

/*
 * Shim implementation of the TextEncoder, TextDecoder spec.
 * Upstream revision 09b44d71759d on Sep 19, 2013 (source URL lost in extraction).
 * Used under Apache License 2.0.
 * Filer: modified to remove non-utf8 aspects, converted to CommonJS.
 */
(function(global) {
'use strict';
// Utilities
/**
 * Tests whether a number lies inside a closed interval.
 * @param {number} a The number to test.
 * @param {number} min The minimum value in the range, inclusive.
 * @param {number} max The maximum value in the range, inclusive.
 * @return {boolean} True if a >= min and a <= max.
 */
function inRange(a, min, max) {
  return min <= a && a <= max;
}
/**
 * Integer division that rounds toward negative infinity.
 * @param {number} n The numerator.
 * @param {number} d The denominator.
 * @return {number} The result of the integer division of n by d.
 */
function div(n, d) {
  return Math.floor(n / d);
}
// Implementation of Encoding specification
// 3. Terminology
// 4. Encodings
// Sentinel a byte stream's get() returns once every byte has been consumed.
/** @const */ var EOF_byte = -1;
// Sentinel a code point stream's get() (and a decoder) returns at end of input.
/** @const */ var EOF_code_point = -1;
/**
 * Cursor over a byte array. The read position is private to the instance
 * and is only moved through offset().
 * @constructor
 * @param {Uint8Array} bytes Array of bytes that provide the stream.
 */
function ByteInputStream(bytes) {
  /** @type {number} */
  var pos = 0;

  /**
   * Peeks at the byte under the cursor without consuming it.
   * @return {number} The current byte, or EOF_byte when the cursor is at
   *     the end of the buffer.
   */
  this.get = function() {
    return (pos >= bytes.length) ? EOF_byte : Number(bytes[pos]);
  };

  /**
   * Moves the cursor.
   * @param {number} n Number (positive or negative) by which to
   *     offset the byte pointer.
   * @throws {Error} If the move would leave the buffer; the cursor is left
   *     where it was in that case.
   */
  this.offset = function(n) {
    // Validate first so a failed seek cannot leave the cursor out of bounds.
    var next = pos + n;
    if (next < 0) {
      throw new Error('Seeking past start of the buffer');
    }
    if (next > bytes.length) {
      throw new Error('Seeking past EOF');
    }
    pos = next;
  };

  /**
   * Compares the bytes starting at the cursor with a test sequence.
   * @param {Array.<number>} test Array of bytes to compare against.
   * @return {boolean} True if the bytes at the current position match the
   *     test bytes.
   */
  this.match = function(test) {
    // Only the bytes from the cursor onward are available for matching.
    if (test.length > bytes.length - pos) {
      return false;
    }
    var i;
    for (i = 0; i < test.length; i += 1) {
      if (Number(bytes[pos + i]) !== test[i]) {
        return false;
      }
    }
    return true;
  };
}
/**
 * Appends bytes to a caller-supplied array, starting at index 0.
 * @constructor
 * @param {Array.<number>} bytes The array to write bytes into.
 */
function ByteOutputStream(bytes) {
  /** @type {number} */
  var pos = 0;

  /**
   * Writes each argument, in order, at the next free index.
   * @param {...number} var_args The byte or bytes to emit into the stream.
   * @return {number} The last byte emitted, or EOF_byte when called with no
   *     arguments.
   */
  this.emit = function(var_args) {
    /** @type {number} */
    var last = EOF_byte;
    var i;
    for (i = 0; i < arguments.length; ++i) {
      last = Number(arguments[i]);
      bytes[pos++] = last;
    }
    return last;
  };
}
/**
 * Cursor over the Unicode code points of a JavaScript string. The string is
 * converted once, at construction time.
 * @constructor
 * @param {string} string The source of code units for the stream.
 */
function CodePointInputStream(string) {
  /**
   * Converts UTF-16 code units to code points. Surrogate pairs are combined;
   * any unpaired surrogate is replaced with U+FFFD.
   * @param {string} string Input string of UTF-16 code units.
   * @return {Array.<number>} Code points.
   */
  function stringToCodePoints(string) {
    /** @type {Array.<number>} */
    var cps = [];
    // Based on the WebIDL DOMString-to-scalar-values conversion.
    var i = 0, n = string.length;
    while (i < string.length) {
      var c = string.charCodeAt(i);
      if (!inRange(c, 0xD800, 0xDFFF)) {
        // Not a surrogate: the code unit is the code point.
        cps.push(c);
      } else if (inRange(c, 0xDC00, 0xDFFF)) {
        // Low surrogate with no preceding high surrogate.
        cps.push(0xFFFD);
      } else { // (inRange(c, 0xD800, 0xDBFF))
        if (i === n - 1) {
          // High surrogate at the very end of the string.
          cps.push(0xFFFD);
        } else {
          var d = string.charCodeAt(i + 1);
          if (inRange(d, 0xDC00, 0xDFFF)) {
            var a = c & 0x3FF;
            var b = d & 0x3FF;
            // Consume the low surrogate as well.
            i += 1;
            cps.push(0x10000 + (a << 10) + b);
          } else {
            // High surrogate not followed by a low surrogate.
            cps.push(0xFFFD);
          }
        }
      }
      i += 1;
    }
    return cps;
  }

  /** @type {number} */
  var pos = 0;
  /** @type {Array.<number>} */
  var cps = stringToCodePoints(string);

  /**
   * Moves the cursor.
   * @param {number} n The number of code points (positive or negative)
   *     to advance the code point pointer by.
   * @throws {Error} If the resulting position is outside the stream.
   */
  this.offset = function(n) {
    pos += n;
    if (pos < 0) {
      throw new Error('Seeking past start of the buffer');
    }
    if (pos > cps.length) {
      throw new Error('Seeking past EOF');
    }
  };

  /**
   * Peeks at the code point under the cursor without consuming it.
   * @return {number} The current code point, or EOF_code_point at the end.
   */
  this.get = function() {
    if (pos >= cps.length) {
      return EOF_code_point;
    }
    return cps[pos];
  };
}
/**
 * Accumulates code points into a JavaScript (UTF-16) string.
 * @constructor
 */
function CodePointOutputStream() {
  /** @type {string} */
  var string = '';

  /** @return {string} The accumulated string. */
  this.string = function() {
    return string;
  };

  /**
   * Appends one code point, writing a surrogate pair for values above
   * 0xFFFF.
   * @param {number} c The code point to encode into the stream.
   */
  this.emit = function(c) {
    if (c <= 0xFFFF) {
      string += String.fromCharCode(c);
    } else {
      c -= 0x10000;
      string += String.fromCharCode(0xD800 + ((c >> 10) & 0x3ff));
      string += String.fromCharCode(0xDC00 + (c & 0x3ff));
    }
  };
}
/**
 * Error raised when bytes cannot be decoded or a code point cannot be
 * encoded.
 * @constructor
 * @param {string} message Description of the error.
 */
function EncodingError(message) {
  this.name = 'EncodingError';
  this.message = message;
  this.code = 0;
}
// Inherit from Error through a fresh prototype object. Aliasing
// Error.prototype directly would make every Error pass
// `instanceof EncodingError`.
EncodingError.prototype = Object.create(Error.prototype);
EncodingError.prototype.constructor = EncodingError;
/**
 * Reports a decoding failure: throws in fatal mode, otherwise yields the
 * replacement code point.
 * @param {boolean} fatal If true, decoding errors raise an exception.
 * @param {number=} opt_code_point Override the standard fallback code point.
 * @return {number} The code point to insert on a decoding error
 *     (opt_code_point when truthy, else U+FFFD).
 * @throws {EncodingError} When fatal is true.
 */
function decoderError(fatal, opt_code_point) {
  if (fatal) {
    throw new EncodingError('Decoder error');
  }
  return opt_code_point || 0xFFFD;
}
/**
 * Reports an encoding failure. Always throws.
 * @param {number} code_point The code point that could not be encoded.
 * @throws {EncodingError} Always.
 */
function encoderError(code_point) {
  throw new EncodingError('The code point ' + code_point +
                          ' could not be encoded.');
}
/**
 * Looks up an encoding by label. The label is stringified, trimmed and
 * lower-cased before the lookup.
 * @param {string} label The encoding label.
 * @return {?{name:string,labels:Array.<string>}} The matching encoding, or
 *     null when the label is not registered.
 */
function getEncoding(label) {
  var key = String(label).trim().toLowerCase();
  // Own-property check so inherited names such as "constructor" or
  // "toString" are never mistaken for encoding labels.
  if (Object.prototype.hasOwnProperty.call(label_to_encoding, key)) {
    return label_to_encoding[key];
  }
  return null;
}
/**
 * Table of supported encodings, grouped by heading. Only UTF-8 remains;
 * its labels are the ones the Encoding Standard assigns to UTF-8.
 * @type {Array.<{encodings: Array.<{name:string,labels:Array.<string>}>,
 *      heading: string}>}
 */
var encodings = [
  {
    "encodings": [
      {
        "labels": [
          "unicode-1-1-utf-8",
          "utf-8",
          "utf8"
        ],
        "name": "utf-8"
      }
    ],
    "heading": "The Encoding"
  }
  // XXXfiler - removed non-utf8 aspects
];

// Lookup indexes built from the table: by canonical name and by label.
var name_to_encoding = {};
var label_to_encoding = {};
encodings.forEach(function(category) {
  category.encodings.forEach(function(encoding) {
    name_to_encoding[encoding.name] = encoding;
    encoding.labels.forEach(function(label) {
      label_to_encoding[label] = encoding;
    });
  });
});
// 7. The encoding
// 7.1 utf-8
/**
 * Incremental UTF-8 decoder. State for a partially read multi-byte sequence
 * is kept between decode() calls, so a sequence may span several calls.
 * @constructor
 * @param {{fatal: boolean}} options
 */
function UTF8Decoder(options) {
  var fatal = options.fatal;
  var /** @type {number} */ utf8_code_point = 0,
      /** @type {number} */ utf8_bytes_needed = 0,
      /** @type {number} */ utf8_bytes_seen = 0,
      /** @type {number} */ utf8_lower_boundary = 0;

  /** Clears the state of the sequence currently being assembled. */
  function reset() {
    utf8_code_point = 0;
    utf8_bytes_needed = 0;
    utf8_bytes_seen = 0;
    utf8_lower_boundary = 0;
  }

  /**
   * Consumes at most one byte from the stream.
   * @param {ByteInputStream} byte_pointer The byte stream to decode.
   * @return {?number} The next code point decoded, or null if not enough
   *     data exists in the input stream to decode a complete code point.
   *     Returns EOF_code_point at a clean end of stream.
   * @throws {EncodingError} On malformed input when the decoder is fatal
   *     (raised through decoderError).
   */
  this.decode = function(byte_pointer) {
    var bite = byte_pointer.get();
    if (bite === EOF_byte) {
      if (utf8_bytes_needed !== 0) {
        // Truncated sequence: report it once, then behave as a clean EOF.
        reset();
        return decoderError(fatal);
      }
      return EOF_code_point;
    }
    // Consume the byte; without this the caller's read loop never ends.
    byte_pointer.offset(1);

    if (utf8_bytes_needed === 0) {
      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      if (inRange(bite, 0xC2, 0xDF)) {
        utf8_bytes_needed = 1;
        utf8_lower_boundary = 0x80;
        utf8_code_point = bite - 0xC0;
      } else if (inRange(bite, 0xE0, 0xEF)) {
        utf8_bytes_needed = 2;
        utf8_lower_boundary = 0x800;
        utf8_code_point = bite - 0xE0;
      } else if (inRange(bite, 0xF0, 0xF4)) {
        utf8_bytes_needed = 3;
        utf8_lower_boundary = 0x10000;
        utf8_code_point = bite - 0xF0;
      } else {
        return decoderError(fatal);
      }
      // Shift the lead byte's payload above the continuation bytes to come.
      utf8_code_point = utf8_code_point * Math.pow(64, utf8_bytes_needed);
      return null;
    }

    if (!inRange(bite, 0x80, 0xBF)) {
      reset();
      // Not a continuation byte: push it back so the next call decodes it
      // as the start of a new sequence.
      byte_pointer.offset(-1);
      return decoderError(fatal);
    }

    utf8_bytes_seen += 1;
    utf8_code_point = utf8_code_point + (bite - 0x80) *
        Math.pow(64, utf8_bytes_needed - utf8_bytes_seen);

    if (utf8_bytes_seen !== utf8_bytes_needed) {
      return null;
    }

    var code_point = utf8_code_point;
    var lower_boundary = utf8_lower_boundary;
    reset();
    // Reject overlong forms, values above U+10FFFF and surrogates.
    if (inRange(code_point, lower_boundary, 0x10FFFF) &&
        !inRange(code_point, 0xD800, 0xDFFF)) {
      return code_point;
    }
    return decoderError(fatal);
  };
}
/**
 * UTF-8 encoder. Each encode() call turns one code point into one to four
 * bytes.
 * @constructor
 * @param {{fatal: boolean}} options Accepted for parity with the decoder;
 *     the encoder does not read it.
 */
function UTF8Encoder(options) {
  /**
   * Encodes the code point under the input cursor and advances past it.
   * @param {ByteOutputStream} output_byte_stream Output byte stream.
   * @param {CodePointInputStream} code_point_pointer Input stream.
   * @return {number} The last byte emitted, or EOF_byte when the input is
   *     exhausted.
   * @throws {EncodingError} For surrogates and values above U+10FFFF
   *     (raised through encoderError).
   */
  this.encode = function(output_byte_stream, code_point_pointer) {
    var code_point = code_point_pointer.get();
    if (code_point === EOF_code_point) {
      return EOF_byte;
    }
    // Consume the code point; without this the caller's loop never ends.
    code_point_pointer.offset(1);

    if (inRange(code_point, 0xD800, 0xDFFF)) {
      return encoderError(code_point);
    }
    if (inRange(code_point, 0x0000, 0x007f)) {
      return output_byte_stream.emit(code_point);
    }

    // count = number of continuation bytes; offset = lead-byte marker bits.
    var count, offset;
    if (inRange(code_point, 0x0080, 0x07FF)) {
      count = 1;
      offset = 0xC0;
    } else if (inRange(code_point, 0x0800, 0xFFFF)) {
      count = 2;
      offset = 0xE0;
    } else if (inRange(code_point, 0x10000, 0x10FFFF)) {
      count = 3;
      offset = 0xF0;
    } else {
      // Outside the Unicode range: refuse rather than emit NaN bytes.
      return encoderError(code_point);
    }

    var result = output_byte_stream.emit(
        div(code_point, Math.pow(64, count)) + offset);
    while (count > 0) {
      // Emit the next six payload bits, most significant group first.
      var temp = div(code_point, Math.pow(64, count - 1));
      result = output_byte_stream.emit(0x80 + (temp % 64));
      count -= 1;
    }
    return result;
  };
}
// Attach the codec factories to the utf-8 entry; TextEncoder and TextDecoder
// obtain their codec instances through these.
name_to_encoding['utf-8'].getEncoder = function(options) {
  return new UTF8Encoder(options);
};
name_to_encoding['utf-8'].getDecoder = function(options) {
  return new UTF8Decoder(options);
};
// Implementation of Text Encoding Web API
// Label used by the TextEncoder and TextDecoder constructors when the caller
// passes no (or a falsy) encoding argument.
/** @const */ var DEFAULT_ENCODING = 'utf-8';
/**
 * Encodes strings into bytes.
 * @constructor
 * @param {string=} opt_encoding The label of the encoding;
 *     defaults to 'utf-8'.
 * @param {{fatal: boolean}=} options
 * @throws {TypeError} If called without `new`, or if the label does not
 *     resolve to an encoding named utf-8, utf-16le or utf-16be.
 */
function TextEncoder(opt_encoding, options) {
  if (!(this instanceof TextEncoder)) {
    throw new TypeError('Constructor cannot be called as a function');
  }
  opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
  options = Object(options);
  /** @private */
  this._encoding = getEncoding(opt_encoding);
  if (this._encoding === null || (this._encoding.name !== 'utf-8' &&
                                  this._encoding.name !== 'utf-16le' &&
                                  this._encoding.name !== 'utf-16be')) {
    throw new TypeError('Unknown encoding: ' + opt_encoding);
  }
  /** @private @type {boolean} */
  this._streaming = false;
  /** @private */
  this._encoder = null;
  /** @private @type {{fatal: boolean}=} */
  this._options = { fatal: Boolean(options.fatal) };

  // Expose the canonical encoding name, read-only where the engine allows.
  if (Object.defineProperty) {
    Object.defineProperty(
        this, 'encoding',
        { get: function() { return this._encoding.name; } });
  } else {
    this.encoding = this._encoding.name;
  }

  return this;
}

TextEncoder.prototype = {
  /**
   * Encodes a string. With options.stream set, the encoder instance is kept
   * for the next call; otherwise it is flushed and discarded.
   * @param {string=} opt_string The string to encode.
   * @param {{stream: boolean}=} options
   * @return {Uint8Array} The encoded bytes.
   */
  encode: function encode(opt_string, options) {
    opt_string = opt_string ? String(opt_string) : '';
    options = Object(options);
    // TODO: any options?
    if (!this._streaming) {
      this._encoder = this._encoding.getEncoder(this._options);
    }
    this._streaming = Boolean(options.stream);

    var bytes = [];
    var output_stream = new ByteOutputStream(bytes);
    var input_stream = new CodePointInputStream(opt_string);
    while (input_stream.get() !== EOF_code_point) {
      this._encoder.encode(output_stream, input_stream);
    }
    if (!this._streaming) {
      // Final call: drain the encoder until it reports end of stream.
      var last_byte;
      do {
        last_byte = this._encoder.encode(output_stream, input_stream);
      } while (last_byte !== EOF_byte);
      this._encoder = null;
    }
    return new Uint8Array(bytes);
  }
};
/**
 * Decodes bytes into strings.
 * @constructor
 * @param {string=} opt_encoding The label of the encoding;
 *     defaults to 'utf-8'.
 * @param {{fatal: boolean}=} options
 * @throws {TypeError} If called without `new`, or if the label is unknown.
 */
function TextDecoder(opt_encoding, options) {
  if (!(this instanceof TextDecoder)) {
    throw new TypeError('Constructor cannot be called as a function');
  }
  opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
  options = Object(options);
  /** @private */
  this._encoding = getEncoding(opt_encoding);
  if (this._encoding === null) {
    throw new TypeError('Unknown encoding: ' + opt_encoding);
  }
  /** @private @type {boolean} */
  this._streaming = false;
  /** @private */
  this._decoder = null;
  /** @private @type {{fatal: boolean}=} */
  this._options = { fatal: Boolean(options.fatal) };

  // Expose the canonical encoding name, read-only where the engine allows.
  if (Object.defineProperty) {
    Object.defineProperty(
        this, 'encoding',
        { get: function() { return this._encoding.name; } });
  } else {
    this.encoding = this._encoding.name;
  }

  return this;
}

// TODO: Issue if input byte stream is offset by decoder
// TODO: BOM detection will not work if stream header spans multiple calls
// (last N bytes of previous stream may need to be retained?)
TextDecoder.prototype = {
  /**
   * Decodes the bytes of a view. With options.stream set, decoder state is
   * kept for the next call; otherwise the decoder is flushed and discarded.
   * A leading U+FEFF in the first non-empty result is removed.
   * @param {ArrayBufferView=} opt_view The buffer of bytes to decode.
   * @param {{stream: boolean}=} options
   * @return {string} The decoded text.
   * @throws {TypeError} If opt_view is given but is not view-like.
   */
  decode: function decode(opt_view, options) {
    if (opt_view && !('buffer' in opt_view && 'byteOffset' in opt_view &&
                      'byteLength' in opt_view)) {
      throw new TypeError('Expected ArrayBufferView');
    } else if (!opt_view) {
      opt_view = new Uint8Array(0);
    }
    options = Object(options);

    if (!this._streaming) {
      this._decoder = this._encoding.getDecoder(this._options);
      this._BOMseen = false;
    }
    this._streaming = Boolean(options.stream);

    // Re-wrap as bytes, limited to the window the view actually covers.
    var bytes = new Uint8Array(opt_view.buffer,
                               opt_view.byteOffset,
                               opt_view.byteLength);
    var input_stream = new ByteInputStream(bytes);

    var output_stream = new CodePointOutputStream(), code_point;
    while (input_stream.get() !== EOF_byte) {
      code_point = this._decoder.decode(input_stream);
      if (code_point !== null && code_point !== EOF_code_point) {
        output_stream.emit(code_point);
      }
    }
    if (!this._streaming) {
      // Final call: let the decoder report any unfinished sequence.
      do {
        code_point = this._decoder.decode(input_stream);
        if (code_point !== null && code_point !== EOF_code_point) {
          output_stream.emit(code_point);
        }
      } while (code_point !== EOF_code_point &&
               input_stream.get() !== EOF_byte);
      this._decoder = null;
    }

    var result = output_stream.string();
    if (!this._BOMseen && result.length) {
      this._BOMseen = true;
      if (['utf-8', 'utf-16le', 'utf-16be'].indexOf(this.encoding) !== -1 &&
          result.charCodeAt(0) === 0xFEFF) {
        result = result.substring(1);
      }
    }

    return result;
  }
};
// Prefer native impl if available
module.exports = {
TextEncoder: global['TextEncoder'] || TextEncoder,
TextDecoder: global['TextDecoder'] || TextDecoder