From 6cc89b9e272440d93b6354f098031c3a22803686 Mon Sep 17 00:00:00 2001 From: Kitson Kelly Date: Fri, 7 Dec 2018 05:01:15 +1100 Subject: [PATCH] Use alternate TextEncoder/TextDecoder implementation (#1281) This is faster and smaller. --- js/blob.ts | 1 + js/dom_types.ts | 2 + js/fetch.ts | 2 +- js/globals.ts | 6 +- js/text_encoding.ts | 330 ++++++++++++++++++++-- js/text_encoding_test.ts | 46 +++ package.json | 1 - third_party | 2 +- tools/ts_library_builder/build_library.ts | 18 +- 9 files changed, 366 insertions(+), 42 deletions(-) diff --git a/js/blob.ts b/js/blob.ts index 8dcc48ba27..4717a28fbb 100644 --- a/js/blob.ts +++ b/js/blob.ts @@ -1,6 +1,7 @@ // Copyright 2018 the Deno authors. All rights reserved. MIT license. import * as domTypes from "./dom_types"; import { containsOnlyASCII } from "./util"; +import { TextEncoder } from "./text_encoding"; const bytesSymbol = Symbol("bytes"); diff --git a/js/dom_types.ts b/js/dom_types.ts index 22f704ee4d..a033dc3764 100644 --- a/js/dom_types.ts +++ b/js/dom_types.ts @@ -13,6 +13,8 @@ See the Apache Version 2.0 License for specific language governing permissions and limitations under the License. *******************************************************************************/ +export type BufferSource = ArrayBufferView | ArrayBuffer; + export type HeadersInit = | Headers | Array<[string, string]> diff --git a/js/fetch.ts b/js/fetch.ts index 06ed5dbca6..58e16f8ee0 100644 --- a/js/fetch.ts +++ b/js/fetch.ts @@ -4,7 +4,7 @@ import * as flatbuffers from "./flatbuffers"; import { sendAsync } from "./dispatch"; import * as msg from "gen/msg_generated"; import * as domTypes from "./dom_types"; -import { TextDecoder } from "./text_encoding"; +import { TextDecoder, TextEncoder } from "./text_encoding"; import { DenoBlob } from "./blob"; import { Headers } from "./headers"; import * as io from "./io"; diff --git a/js/globals.ts b/js/globals.ts index 6f29d97d49..cf45b4239d 100644 --- a/js/globals.ts +++ b/js/globals.ts @@ -29,8 +29,6 @@ import { libdeno } from "./libdeno"; declare global { const console: consoleTypes.Console; const setTimeout: typeof timers.setTimeout; - // tslint:disable-next-line:variable-name - const TextEncoder: typeof textEncoding.TextEncoder; } // A reference to the global object. @@ -69,7 +67,7 @@ export type Headers = domTypes.Headers; window.FormData = formData.FormData as domTypes.FormDataConstructor; export type FormData = domTypes.FormData; -// While these are classes, they have their global instance types created in -// other type definitions, therefore we do not have to include them here. window.TextEncoder = textEncoding.TextEncoder; +export type TextEncoder = textEncoding.TextEncoder; window.TextDecoder = textEncoding.TextDecoder; +export type TextDecoder = textEncoding.TextDecoder; diff --git a/js/text_encoding.ts b/js/text_encoding.ts index 72a355d5fa..7cd324c726 100644 --- a/js/text_encoding.ts +++ b/js/text_encoding.ts @@ -1,5 +1,29 @@ -// Copyright 2018 the Deno authors. All rights reserved. MIT license. +// The following code is based off of text-encoding at: +// https://github.com/inexorabletash/text-encoding +// +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + import * as base64 from "base64-js"; +import * as domTypes from "./dom_types"; import { DenoError, ErrorKind } from "./errors"; /** Decodes a string of data which has been encoded using base-64. */ @@ -43,29 +67,299 @@ export function btoa(s: string): string { return result; } -// @types/text-encoding relies on lib.dom.d.ts for some interfaces. We do not -// want to include lib.dom.d.ts (due to size) into deno's global type scope. -// Therefore this hack: add a few of the missing interfaces in -// @types/text-encoding to the global scope before importing. +interface Decoder { + handler(stream: Stream, byte: number): number | number[] | null; +} -declare global { - type BufferSource = ArrayBufferView | ArrayBuffer; +interface Encoder { + handler(codePoint: number): number | number[]; +} - interface TextDecodeOptions { - stream?: boolean; +const CONTINUE = null; +const END_OF_STREAM = -1; +const FINISHED = -1; + +function codePointsToString(codePoints: number[]): string { + let s = ""; + for (const cp of codePoints) { + s += String.fromCodePoint(cp); + } + return s; +} + +function decoderError(fatal: boolean): number | never { + if (fatal) { + throw new TypeError("Decoder error."); + } + return 0xfffd; // default code point +} + +function inRange(a: number, min: number, max: number) { + return min <= a && a <= max; +} + +function stringToCodePoints(input: string): number[] { + const u: number[] = []; + for (const c of input) { + u.push(c.codePointAt(0)!); + } + return u; +} + +class Stream { + private _tokens: number[]; + constructor(tokens: number[] | Uint8Array) { + this._tokens = [].slice.call(tokens); + this._tokens.reverse(); } - interface TextDecoderOptions { - fatal?: boolean; - ignoreBOM?: boolean; + endOfStream(): boolean { + return !this._tokens.length; } - interface TextDecoder { - readonly encoding: string; - readonly fatal: boolean; - readonly ignoreBOM: boolean; - decode(input?: BufferSource, options?: TextDecodeOptions): string; + read(): number { + return !this._tokens.length ? END_OF_STREAM : this._tokens.pop()!; + } + + prepend(token: number | number[]): void { + if (Array.isArray(token)) { + while (token.length) { + this._tokens.push(token.pop()!); + } + } else { + this._tokens.push(token); + } + } + + push(token: number | number[]): void { + if (Array.isArray(token)) { + while (token.length) { + this._tokens.unshift(token.shift()!); + } + } else { + this._tokens.unshift(token); + } } } -export { TextEncoder, TextDecoder } from "text-encoding"; +class UTF8Decoder implements Decoder { + private _codePoint = 0; + private _bytesSeen = 0; + private _bytesNeeded = 0; + private _fatal: boolean; + private _lowerBoundary = 0x80; + private _upperBoundary = 0xbf; + + constructor(options = { fatal: false }) { + this._fatal = options.fatal; + } + + handler(stream: Stream, byte: number): number | null { + if (byte === END_OF_STREAM && this._bytesNeeded !== 0) { + this._bytesNeeded = 0; + return decoderError(this._fatal); + } + + if (byte === END_OF_STREAM) { + return FINISHED; + } + + if (this._bytesNeeded === 0) { + if (inRange(byte, 0x00, 0x7f)) { + // Single byte code point + return byte; + } else if (inRange(byte, 0xc2, 0xdf)) { + // Two byte code point + this._bytesNeeded = 1; + this._codePoint = byte & 0x1f; + } else if (inRange(byte, 0xe0, 0xef)) { + // Three byte code point + if (byte === 0xe0) { + this._lowerBoundary = 0xa0; + } else if (byte === 0xed) { + this._upperBoundary = 0x9f; + } + this._bytesNeeded = 2; + this._codePoint = byte & 0xf; + } else if (inRange(byte, 0xf0, 0xf4)) { + if (byte === 0xf0) { + this._lowerBoundary = 0x90; + } else if (byte === 0xf4) { + this._upperBoundary = 0x8f; + } + this._bytesNeeded = 3; + this._codePoint = byte & 0x7; + } else { + return decoderError(this._fatal); + } + return CONTINUE; + } + + if (!inRange(byte, this._lowerBoundary, this._upperBoundary)) { + // Byte out of range, so encoding error + this._codePoint = 0; + this._bytesNeeded = 0; + this._bytesSeen = 0; + stream.prepend(byte); + return decoderError(this._fatal); + } + + this._lowerBoundary = 0x80; + this._upperBoundary = 0xbf; + + this._codePoint = (this._codePoint << 6) | (byte & 0x3f); + + this._bytesSeen++; + + if (this._bytesSeen !== this._bytesNeeded) { + return CONTINUE; + } + + const codePoint = this._codePoint; + + this._codePoint = 0; + this._bytesNeeded = 0; + this._bytesSeen = 0; + + return codePoint; + } +} + +class UTF8Encoder implements Encoder { + handler(codePoint: number): number | number[] { + if (codePoint === END_OF_STREAM) { + return FINISHED; + } + + if (inRange(codePoint, 0x00, 0x7f)) { + return codePoint; + } + + let count: number; + let offset: number; + if (inRange(codePoint, 0x0080, 0x07ff)) { + count = 1; + offset = 0xc0; + } else if (inRange(codePoint, 0x0800, 0xffff)) { + count = 2; + offset = 0xe0; + } else if (inRange(codePoint, 0x10000, 0x10ffff)) { + count = 3; + offset = 0xf0; + } else { + throw TypeError(`Code point out of range: \\x${codePoint.toString(16)}`); + } + + const bytes = [(codePoint >> (6 * count)) + offset]; + + while (count > 0) { + const temp = codePoint >> (6 * (count - 1)); + bytes.push(0x80 | (temp & 0x3f)); + count--; + } + + return bytes; + } +} + +export interface TextDecodeOptions { + stream?: false; +} + +export interface TextDecoderOptions { + fatal?: boolean; + ignoreBOM?: false; +} + +export class TextDecoder { + /** Returns encoding's name, lowercased. */ + readonly encoding = "utf-8"; + /** Returns `true` if error mode is "fatal", and `false` otherwise. */ + readonly fatal: boolean = false; + /** Returns `true` if ignore BOM flag is set, and `false` otherwise. */ + readonly ignoreBOM = false; + + constructor( + label: "utf-8" = "utf-8", + options: TextDecoderOptions = { fatal: false } + ) { + if (label !== "utf-8") { + throw new TypeError("Only UTF8 decoding supported."); + } + if (options.ignoreBOM) { + throw new TypeError("Ignoring the BOM not supported."); + } + if (options.fatal) { + this.fatal = true; + } + } + + /** Returns the result of running encoding's decoder. */ + decode( + input?: domTypes.BufferSource, + options: TextDecodeOptions = { stream: false } + ): string { + if (options.stream) { + throw new TypeError("Stream not supported."); + } + + let bytes: Uint8Array; + if (typeof input === "object" && input instanceof ArrayBuffer) { + bytes = new Uint8Array(input); + } else if ( + typeof input === "object" && + "buffer" in input && + input.buffer instanceof ArrayBuffer + ) { + bytes = new Uint8Array(input.buffer, input.byteOffset, input.byteLength); + } else { + bytes = new Uint8Array(0); + } + + const decoder = new UTF8Decoder({ fatal: this.fatal }); + const inputStream = new Stream(bytes); + const output: number[] = []; + + while (true) { + const result = decoder.handler(inputStream, inputStream.read()); + if (result === FINISHED) { + break; + } + + if (result !== CONTINUE) { + output.push(result); + } + } + + if (output.length > 0 && output[0] === 0xfeff) { + output.shift(); + } + + return codePointsToString(output); + } +} + +export class TextEncoder { + /** Returns "utf-8". */ + readonly encoding = "utf-8"; + /** Returns the result of running UTF-8's encoder. */ + encode(input = ""): Uint8Array { + const encoder = new UTF8Encoder(); + const inputStream = new Stream(stringToCodePoints(input)); + const output: number[] = []; + + while (true) { + const result = encoder.handler(inputStream.read()); + if (result === FINISHED) { + break; + } + if (Array.isArray(result)) { + output.push.apply(output, result); + } else { + output.push(result); + } + } + + return new Uint8Array(output); + } +} diff --git a/js/text_encoding_test.ts b/js/text_encoding_test.ts index 7a9aec833a..402044e369 100644 --- a/js/text_encoding_test.ts +++ b/js/text_encoding_test.ts @@ -24,3 +24,49 @@ test(function btoaFailed() { assert(!!err); assertEqual(err.name, "InvalidInput"); }); + +test(function textDecoder() { + // prettier-ignore + const fixture = new Uint8Array([ + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd + ]); + const decoder = new TextDecoder(); + assertEqual(decoder.decode(fixture), "������"); +}); + +test(function textDecoder2() { + // prettier-ignore + const fixture = new Uint8Array([ + 0xf0, 0x9d, 0x93, 0xbd, + 0xf0, 0x9d, 0x93, 0xae, + 0xf0, 0x9d, 0x94, 0x81, + 0xf0, 0x9d, 0x93, 0xbd + ]); + const decoder = new TextDecoder(); + assertEqual(decoder.decode(fixture), "𝓽𝓮𝔁𝓽"); +}); + +test(function textEncoder() { + const fixture = "������"; + const encoder = new TextEncoder(); + // prettier-ignore + assertEqual(Array.from(encoder.encode(fixture)), [ + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd + ]); +}); + +test(function textEncoder2() { + const fixture = "𝓽𝓮𝔁𝓽"; + const encoder = new TextEncoder(); + // prettier-ignore + assertEqual(Array.from(encoder.encode(fixture)), [ + 0xf0, 0x9d, 0x93, 0xbd, + 0xf0, 0x9d, 0x93, 0xae, + 0xf0, 0x9d, 0x94, 0x81, + 0xf0, 0x9d, 0x93, 0xbd + ]); +}); diff --git a/package.json b/package.json index df6ae9cfa3..e591e09b2f 100644 --- a/package.json +++ b/package.json @@ -20,7 +20,6 @@ "rollup-plugin-typescript2": "^0.16.1", "rollup-pluginutils": "^2.3.0", "source-map-support": "^0.5.6", - "text-encoding": "0.6.4", "ts-node": "^7.0.1", "ts-simple-ast": "17.1.0", "tslint": "^5.10.0", diff --git a/third_party b/third_party index d812372883..e058979631 160000 --- a/third_party +++ b/third_party @@ -1 +1 @@ -Subproject commit d8123728834250395e859b10618ad2ca35f7a555 +Subproject commit e058979631fd3ecc55f8995a02eaa6ff8f35c321 diff --git a/tools/ts_library_builder/build_library.ts b/tools/ts_library_builder/build_library.ts index e1a64215f0..00e9425407 100644 --- a/tools/ts_library_builder/build_library.ts +++ b/tools/ts_library_builder/build_library.ts @@ -13,7 +13,6 @@ import { addInterfaceProperty, addSourceComment, addVariableDeclaration, - appendSourceFile, checkDiagnostics, flattenNamespace, getSourceComment, @@ -370,18 +369,13 @@ export function main({ moduleResolution: ModuleResolutionKind.NodeJs, noLib: true, strict: true, - target: ScriptTarget.ESNext, - types: ["text-encoding"] + target: ScriptTarget.ESNext }, useVirtualFileSystem: true }); // There are files we need to load into memory, so that the project "compiles" loadDtsFiles(outputProject); - // tslint:disable-next-line:max-line-length - const textEncodingFilePath = `${buildPath}/node_modules/@types/text-encoding/index.d.ts`; - loadFiles(outputProject, [textEncodingFilePath]); - outputProject.addExistingSourceFileIfExists(textEncodingFilePath); // libDts is the final output file we are looking to build and we are not // actually creating it, only in memory at this stage. @@ -433,16 +427,6 @@ export function main({ console.log(`Merged "globals" into global scope.`); } - // Since we flatten the namespaces, we don't attempt to import `text-encoding` - // so we then need to concatenate that onto the `libDts` so it can stand on - // its own. - const textEncodingSourceFile = outputProject.getSourceFileOrThrow( - textEncodingFilePath - ); - appendSourceFile(textEncodingSourceFile, libDTs); - // Removing it from the project so we know the libDTs can stand on its own. - outputProject.removeSourceFile(textEncodingSourceFile); - // Add the preamble libDTs.insertStatements(0, libPreamble);