Use alternate TextEncoder/TextDecoder implementation (#1281)

This is faster and smaller.
2024-11-25 15:29:32 -05:00 · 2018-12-07 05:01:15 +11:00 · 2018-12-07 05:01:15 +11:00 · 6cc89b9e27
commit 6cc89b9e27
parent 60c008d23b
9 changed files with 366 additions and 42 deletions
--- a/js/blob.ts
+++ b/js/blob.ts
@ -1,6 +1,7 @@
 // Copyright 2018 the Deno authors. All rights reserved. MIT license.
 import * as domTypes from "./dom_types";
 import { containsOnlyASCII } from "./util";
+import { TextEncoder } from "./text_encoding";

 const bytesSymbol = Symbol("bytes");

--- a/js/dom_types.ts
+++ b/js/dom_types.ts
@ -13,6 +13,8 @@ See the Apache Version 2.0 License for specific language governing permissions
 and limitations under the License.
 *******************************************************************************/

+export type BufferSource = ArrayBufferView | ArrayBuffer;
+
 export type HeadersInit =
  | Headers
  | Array<[string, string]>
--- a/js/fetch.ts
+++ b/js/fetch.ts
@ -4,7 +4,7 @@ import * as flatbuffers from "./flatbuffers";
 import { sendAsync } from "./dispatch";
 import * as msg from "gen/msg_generated";
 import * as domTypes from "./dom_types";
-import { TextDecoder } from "./text_encoding";
+import { TextDecoder, TextEncoder } from "./text_encoding";
 import { DenoBlob } from "./blob";
 import { Headers } from "./headers";
 import * as io from "./io";
--- a/js/globals.ts
+++ b/js/globals.ts
@ -29,8 +29,6 @@ import { libdeno } from "./libdeno";
 declare global {
  const console: consoleTypes.Console;
  const setTimeout: typeof timers.setTimeout;
-  // tslint:disable-next-line:variable-name
-  const TextEncoder: typeof textEncoding.TextEncoder;
 }

 // A reference to the global object.
@ -69,7 +67,7 @@ export type Headers = domTypes.Headers;
 window.FormData = formData.FormData as domTypes.FormDataConstructor;
 export type FormData = domTypes.FormData;

-// While these are classes, they have their global instance types created in
-// other type definitions, therefore we do not have to include them here.
 window.TextEncoder = textEncoding.TextEncoder;
+export type TextEncoder = textEncoding.TextEncoder;
 window.TextDecoder = textEncoding.TextDecoder;
+export type TextDecoder = textEncoding.TextDecoder;
--- a/js/text_encoding.ts
+++ b/js/text_encoding.ts
@ -1,5 +1,29 @@
-// Copyright 2018 the Deno authors. All rights reserved. MIT license.
+// The following code is based off of text-encoding at:
+// https://github.com/inexorabletash/text-encoding
+//
+// Anyone is free to copy, modify, publish, use, compile, sell, or
+// distribute this software, either in source code form or as a compiled
+// binary, for any purpose, commercial or non-commercial, and by any
+// means.
+//
+// In jurisdictions that recognize copyright laws, the author or authors
+// of this software dedicate any and all copyright interest in the
+// software to the public domain. We make this dedication for the benefit
+// of the public at large and to the detriment of our heirs and
+// successors. We intend this dedication to be an overt act of
+// relinquishment in perpetuity of all present and future rights to this
+// software under copyright law.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
 import * as base64 from "base64-js";
+import * as domTypes from "./dom_types";
 import { DenoError, ErrorKind } from "./errors";

 /** Decodes a string of data which has been encoded using base-64. */
@ -43,29 +67,299 @@ export function btoa(s: string): string {
  return result;
 }

-// @types/text-encoding relies on lib.dom.d.ts for some interfaces. We do not
-// want to include lib.dom.d.ts (due to size) into deno's global type scope.
-// Therefore this hack: add a few of the missing interfaces in
-// @types/text-encoding to the global scope before importing.
+/**
+ * Contract for a byte-to-code-point decoder. `handler` is invoked once per
+ * token read from `stream`. The only implementation in this file,
+ * `UTF8Decoder`, returns a decoded code point, `CONTINUE` (null) when more
+ * bytes are required, or `FINISHED` at end of stream; the signature also
+ * permits an array of numbers, which `UTF8Decoder` does not use.
+ */
+interface Decoder {
+  handler(stream: Stream, byte: number): number | number[] | null;
+}

-declare global {
-  type BufferSource = ArrayBufferView | ArrayBuffer;
+/**
+ * Contract for a code-point-to-byte encoder. `handler` receives one code
+ * point (or the `END_OF_STREAM` sentinel) and returns either a single number
+ * or an array of byte values.
+ */
+interface Encoder {
+  handler(codePoint: number): number | number[];
+}

-  interface TextDecodeOptions {
-    stream?: boolean;
+// Returned by a decoder handler that consumed a byte but cannot emit a code
+// point yet.
+const CONTINUE = null;
+// Returned by Stream.read() once no tokens remain.
+const END_OF_STREAM = -1;
+// Returned by a handler to tell the driving loop to stop. Note that it shares
+// the value -1 with END_OF_STREAM.
+const FINISHED = -1;
+
+/**
+ * Builds a string from a sequence of Unicode code points.
+ *
+ * Each entry is converted with `String.fromCodePoint`, so an entry that is not
+ * a valid code point raises a `RangeError`.
+ */
+function codePointsToString(codePoints: number[]): string {
+  return codePoints.map(point => String.fromCodePoint(point)).join("");
+}
+
+/**
+ * Reports a decoding error.
+ *
+ * In non-fatal mode the replacement code point U+FFFD is returned so decoding
+ * can carry on; in fatal mode a `TypeError` is thrown instead.
+ */
+function decoderError(fatal: boolean): number | never {
+  if (!fatal) {
+    return 0xfffd; // default code point
+  }
+  throw new TypeError("Decoder error.");
+}
+
+/** Returns `true` when `a` lies within the inclusive range [`min`, `max`]. */
+function inRange(a: number, min: number, max: number) {
+  const atOrAboveMin = a >= min;
+  const atOrBelowMax = max >= a;
+  return atOrAboveMin && atOrBelowMax;
+}
+
+/**
+ * Converts a string into an array of Unicode scalar values.
+ *
+ * Iterating a string yields whole code points, so valid surrogate pairs are
+ * combined into a single supplementary-plane value. An unpaired surrogate
+ * (0xD800-0xDFFF) is not a scalar value and cannot be represented in
+ * well-formed UTF-8, so it is replaced with U+FFFD, matching the USVString
+ * conversion that `TextEncoder.encode()` is specified to apply.
+ */
+function stringToCodePoints(input: string): number[] {
+  const u: number[] = [];
+  for (const c of input) {
+    const cp = c.codePointAt(0)!;
+    // Only a lone surrogate can surface here; paired ones are >= 0x10000.
+    const isLoneSurrogate = cp >= 0xd800 && cp <= 0xdfff;
+    u.push(isLoneSurrogate ? 0xfffd : cp);
+  }
+  return u;
+}
+
+/**
+ * A queue of numeric tokens (bytes or code points) consumed front-to-back.
+ *
+ * Tokens are stored in reverse order so that the front of the stream is the
+ * end of the backing array: reading is a `pop()` and restoring a token to the
+ * front is a `push()`.
+ */
+class Stream {
+  private _tokens: number[];
+  constructor(tokens: number[] | Uint8Array) {
+    // Copy into a plain array (also works for Uint8Array), then reverse it.
+    this._tokens = [].slice.call(tokens);
+    this._tokens.reverse();
  }

-  interface TextDecoderOptions {
-    fatal?: boolean;
-    ignoreBOM?: boolean;
+  /** Returns `true` when no tokens remain. */
+  endOfStream(): boolean {
+    return !this._tokens.length;
  }

-  interface TextDecoder {
-    readonly encoding: string;
-    readonly fatal: boolean;
-    readonly ignoreBOM: boolean;
-    decode(input?: BufferSource, options?: TextDecodeOptions): string;
+  /** Removes and returns the next token, or `END_OF_STREAM` when empty. */
+  read(): number {
+    return !this._tokens.length ? END_OF_STREAM : this._tokens.pop()!;
+  }
+
+  /**
+   * Puts one or more tokens back at the front of the stream, so they are the
+   * next ones returned by `read()`, in their original order. When an array is
+   * passed it is emptied in the process.
+   */
+  prepend(token: number | number[]): void {
+    if (Array.isArray(token)) {
+      while (token.length) {
+        this._tokens.push(token.pop()!);
+      }
+    } else {
+      this._tokens.push(token);
+    }
+  }
+
+  /**
+   * Appends one or more tokens to the back of the stream, so they are read
+   * after everything already queued. When an array is passed it is emptied in
+   * the process.
+   */
+  push(token: number | number[]): void {
+    if (Array.isArray(token)) {
+      while (token.length) {
+        this._tokens.unshift(token.shift()!);
+      }
+    } else {
+      this._tokens.unshift(token);
+    }
  }
 }

-export { TextEncoder, TextDecoder } from "text-encoding";
+/**
+ * Incremental UTF-8 decoder. `handler` is fed one byte at a time and keeps
+ * the state of a partially decoded multi-byte sequence between calls.
+ */
+class UTF8Decoder implements Decoder {
+  // Code point accumulated so far for the sequence in progress.
+  private _codePoint = 0;
+  // Continuation bytes consumed so far for the sequence in progress.
+  private _bytesSeen = 0;
+  // Continuation bytes required by the current lead byte (0 = none pending).
+  private _bytesNeeded = 0;
+  private _fatal: boolean;
+  // Inclusive range accepted for the next continuation byte. Some lead bytes
+  // narrow it for the first continuation byte only, to reject overlong forms,
+  // surrogates and values above U+10FFFF.
+  private _lowerBoundary = 0x80;
+  private _upperBoundary = 0xbf;
+
+  constructor(options = { fatal: false }) {
+    this._fatal = options.fatal;
+  }
+
+  /**
+   * Processes one byte.
+   *
+   * @param stream Source of the bytes; a byte that terminates a sequence
+   *   early is put back on it so it is re-processed as a fresh lead byte.
+   * @param byte The byte just read, or `END_OF_STREAM`.
+   * @returns A decoded code point, `CONTINUE` when more bytes are needed, or
+   *   `FINISHED` at end of stream. Malformed input yields U+FFFD.
+   * @throws TypeError on malformed input when the decoder is fatal.
+   */
+  handler(stream: Stream, byte: number): number | null {
+    if (byte === END_OF_STREAM && this._bytesNeeded !== 0) {
+      // Input ended in the middle of a multi-byte sequence.
+      this._bytesNeeded = 0;
+      return decoderError(this._fatal);
+    }
+
+    if (byte === END_OF_STREAM) {
+      return FINISHED;
+    }
+
+    if (this._bytesNeeded === 0) {
+      if (inRange(byte, 0x00, 0x7f)) {
+        // Single byte code point
+        return byte;
+      } else if (inRange(byte, 0xc2, 0xdf)) {
+        // Two byte code point
+        this._bytesNeeded = 1;
+        this._codePoint = byte & 0x1f;
+      } else if (inRange(byte, 0xe0, 0xef)) {
+        // Three byte code point
+        if (byte === 0xe0) {
+          this._lowerBoundary = 0xa0;
+        } else if (byte === 0xed) {
+          this._upperBoundary = 0x9f;
+        }
+        this._bytesNeeded = 2;
+        this._codePoint = byte & 0xf;
+      } else if (inRange(byte, 0xf0, 0xf4)) {
+        // Four byte code point
+        if (byte === 0xf0) {
+          this._lowerBoundary = 0x90;
+        } else if (byte === 0xf4) {
+          this._upperBoundary = 0x8f;
+        }
+        this._bytesNeeded = 3;
+        this._codePoint = byte & 0x7;
+      } else {
+        return decoderError(this._fatal);
+      }
+      return CONTINUE;
+    }
+
+    if (!inRange(byte, this._lowerBoundary, this._upperBoundary)) {
+      // Byte out of range, so encoding error. Reset ALL sequence state,
+      // including the boundaries: a range narrowed by the failed lead byte
+      // must not leak into the next sequence and reject valid bytes.
+      this._codePoint = 0;
+      this._bytesNeeded = 0;
+      this._bytesSeen = 0;
+      this._lowerBoundary = 0x80;
+      this._upperBoundary = 0xbf;
+      stream.prepend(byte);
+      return decoderError(this._fatal);
+    }
+
+    // The narrowed range applies to the first continuation byte only.
+    this._lowerBoundary = 0x80;
+    this._upperBoundary = 0xbf;
+
+    this._codePoint = (this._codePoint << 6) | (byte & 0x3f);
+
+    this._bytesSeen++;
+
+    if (this._bytesSeen !== this._bytesNeeded) {
+      return CONTINUE;
+    }
+
+    const codePoint = this._codePoint;
+
+    this._codePoint = 0;
+    this._bytesNeeded = 0;
+    this._bytesSeen = 0;
+
+    return codePoint;
+  }
+}
+
+/** Stateless UTF-8 encoder: turns one code point into its byte sequence. */
+class UTF8Encoder implements Encoder {
+  /**
+   * Encodes a single code point.
+   *
+   * @returns `FINISHED` for `END_OF_STREAM`, the code point itself when it is
+   *   ASCII, otherwise an array holding the lead byte followed by the
+   *   continuation bytes.
+   * @throws TypeError when the code point is outside 0x0000-0x10FFFF.
+   */
+  handler(codePoint: number): number | number[] {
+    if (codePoint === END_OF_STREAM) {
+      return FINISHED;
+    }
+    if (inRange(codePoint, 0x00, 0x7f)) {
+      return codePoint;
+    }
+
+    // Number of continuation bytes and the marker bits of the lead byte.
+    let trailing: number;
+    let leadMarker: number;
+    if (inRange(codePoint, 0x0080, 0x07ff)) {
+      trailing = 1;
+      leadMarker = 0xc0;
+    } else if (inRange(codePoint, 0x0800, 0xffff)) {
+      trailing = 2;
+      leadMarker = 0xe0;
+    } else if (inRange(codePoint, 0x10000, 0x10ffff)) {
+      trailing = 3;
+      leadMarker = 0xf0;
+    } else {
+      throw TypeError(`Code point out of range: \\x${codePoint.toString(16)}`);
+    }
+
+    const encoded = [(codePoint >> (6 * trailing)) + leadMarker];
+
+    // Emit the remaining bits six at a time, most significant group first.
+    for (let shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
+      encoded.push(0x80 | ((codePoint >> shift) & 0x3f));
+    }
+
+    return encoded;
+  }
+}
+
+/** Options accepted by `TextDecoder.decode()`. */
+export interface TextDecodeOptions {
+  // Typed as `false` only: `decode()` throws a TypeError when this is truthy.
+  stream?: false;
+}
+
+/** Options accepted by the `TextDecoder` constructor. */
+export interface TextDecoderOptions {
+  // When true, malformed input makes decoding throw instead of producing
+  // U+FFFD.
+  fatal?: boolean;
+  // Typed as `false` only: the constructor throws a TypeError when this is
+  // truthy.
+  ignoreBOM?: false;
+}
+
+/**
+ * UTF-8-only text decoder. Streaming and BOM preservation are not supported;
+ * asking for either raises a `TypeError`.
+ */
+export class TextDecoder {
+  /** Returns encoding's name, lowercased. */
+  readonly encoding = "utf-8";
+  /** Returns `true` if error mode is "fatal", and `false` otherwise. */
+  readonly fatal: boolean = false;
+  /** Returns `true` if ignore BOM flag is set, and `false` otherwise. */
+  readonly ignoreBOM = false;
+
+  /**
+   * @param label Must be "utf-8"; any other value throws a `TypeError`.
+   * @param options `fatal` selects throwing on malformed input; a truthy
+   *   `ignoreBOM` throws a `TypeError`.
+   */
+  constructor(
+    label: "utf-8" = "utf-8",
+    options: TextDecoderOptions = { fatal: false }
+  ) {
+    if (label !== "utf-8") {
+      throw new TypeError("Only UTF8 decoding supported.");
+    }
+    if (options.ignoreBOM) {
+      throw new TypeError("Ignoring the BOM not supported.");
+    }
+    this.fatal = !!options.fatal;
+  }
+
+  /**
+   * Returns the result of running encoding's decoder.
+   *
+   * An `ArrayBuffer` or a view backed by one is decoded as UTF-8; any other
+   * input is treated as zero bytes. A leading U+FEFF in the result is dropped.
+   */
+  decode(
+    input?: domTypes.BufferSource,
+    options: TextDecodeOptions = { stream: false }
+  ): string {
+    if (options.stream) {
+      throw new TypeError("Stream not supported.");
+    }
+
+    let bytes: Uint8Array;
+    if (typeof input === "object" && input instanceof ArrayBuffer) {
+      bytes = new Uint8Array(input);
+    } else if (
+      typeof input === "object" &&
+      "buffer" in input &&
+      input.buffer instanceof ArrayBuffer
+    ) {
+      // Respect the view's window rather than decoding the whole buffer.
+      bytes = new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
+    } else {
+      bytes = new Uint8Array(0);
+    }
+
+    const utf8 = new UTF8Decoder({ fatal: this.fatal });
+    const byteStream = new Stream(bytes);
+    const codePoints: number[] = [];
+
+    for (;;) {
+      const decoded = utf8.handler(byteStream, byteStream.read());
+      if (decoded === FINISHED) {
+        break;
+      }
+      if (decoded === CONTINUE) {
+        continue;
+      }
+      codePoints.push(decoded);
+    }
+
+    // Skip a leading byte order mark, if present.
+    const start = codePoints[0] === 0xfeff ? 1 : 0;
+
+    return codePointsToString(codePoints.slice(start));
+  }
+}
+
+/** UTF-8 text encoder. */
+export class TextEncoder {
+  /** Returns "utf-8". */
+  readonly encoding = "utf-8";
+  /** Returns the result of running UTF-8's encoder. */
+  encode(input = ""): Uint8Array {
+    const utf8 = new UTF8Encoder();
+    const codePoints = new Stream(stringToCodePoints(input));
+    const bytes: number[] = [];
+
+    // The encoder yields either one byte or an array of bytes per code point.
+    for (
+      let chunk = utf8.handler(codePoints.read());
+      chunk !== FINISHED;
+      chunk = utf8.handler(codePoints.read())
+    ) {
+      if (Array.isArray(chunk)) {
+        bytes.push(...chunk);
+      } else {
+        bytes.push(chunk);
+      }
+    }
+
+    return new Uint8Array(bytes);
+  }
+}
--- a/js/text_encoding_test.ts
+++ b/js/text_encoding_test.ts
@ -24,3 +24,49 @@ test(function btoaFailed() {
  assert(!!err);
  assertEqual(err.name, "InvalidInput");
 });
+
+// Decoding six copies of EF BF BD must yield six U+FFFD characters.
+test(function textDecoder() {
+  // prettier-ignore
+  const fixture = new Uint8Array([
+    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd,
+    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd,
+    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd
+  ]);
+  const decoder = new TextDecoder();
+  // Written as an escape so the expectation survives lossy re-encoding of
+  // this file.
+  assertEqual(decoder.decode(fixture), "\ufffd".repeat(6));
+});
+
+// Four-byte sequences must decode to supplementary-plane characters.
+test(function textDecoder2() {
+  // prettier-ignore
+  const encoded = new Uint8Array([
+    0xf0, 0x9d, 0x93, 0xbd,
+    0xf0, 0x9d, 0x93, 0xae,
+    0xf0, 0x9d, 0x94, 0x81,
+    0xf0, 0x9d, 0x93, 0xbd
+  ]);
+  const decoded = new TextDecoder().decode(encoded);
+  assertEqual(decoded, "𝓽𝓮𝔁𝓽");
+});
+
+// Encoding six U+FFFD characters must yield six copies of EF BF BD.
+test(function textEncoder() {
+  // Written as an escape so the fixture survives lossy re-encoding of this
+  // file.
+  const fixture = "\ufffd".repeat(6);
+  const encoder = new TextEncoder();
+  // prettier-ignore
+  assertEqual(Array.from(encoder.encode(fixture)), [
+    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd,
+    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd,
+    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd
+  ]);
+});
+
+// Supplementary-plane characters must encode to four bytes each.
+test(function textEncoder2() {
+  const encoded = new TextEncoder().encode("𝓽𝓮𝔁𝓽");
+  // prettier-ignore
+  assertEqual(Array.from(encoded), [
+    0xf0, 0x9d, 0x93, 0xbd,
+    0xf0, 0x9d, 0x93, 0xae,
+    0xf0, 0x9d, 0x94, 0x81,
+    0xf0, 0x9d, 0x93, 0xbd
+  ]);
+});
--- a/package.json
+++ b/package.json
@ -20,7 +20,6 @@
    "rollup-plugin-typescript2": "^0.16.1",
    "rollup-pluginutils": "^2.3.0",
    "source-map-support": "^0.5.6",
-    "text-encoding": "0.6.4",
    "ts-node": "^7.0.1",
    "ts-simple-ast": "17.1.0",
    "tslint": "^5.10.0",
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit d8123728834250395e859b10618ad2ca35f7a555
+Subproject commit e058979631fd3ecc55f8995a02eaa6ff8f35c321
--- a/tools/ts_library_builder/build_library.ts
+++ b/tools/ts_library_builder/build_library.ts
@ -13,7 +13,6 @@ import {
  addInterfaceProperty,
  addSourceComment,
  addVariableDeclaration,
-  appendSourceFile,
  checkDiagnostics,
  flattenNamespace,
  getSourceComment,
@ -370,18 +369,13 @@ export function main({
      moduleResolution: ModuleResolutionKind.NodeJs,
      noLib: true,
      strict: true,
-      target: ScriptTarget.ESNext,
-      types: ["text-encoding"]
+      target: ScriptTarget.ESNext
    },
    useVirtualFileSystem: true
  });

  // There are files we need to load into memory, so that the project "compiles"
  loadDtsFiles(outputProject);
-  // tslint:disable-next-line:max-line-length
-  const textEncodingFilePath = `${buildPath}/node_modules/@types/text-encoding/index.d.ts`;
-  loadFiles(outputProject, [textEncodingFilePath]);
-  outputProject.addExistingSourceFileIfExists(textEncodingFilePath);

  // libDts is the final output file we are looking to build and we are not
  // actually creating it, only in memory at this stage.
@ -433,16 +427,6 @@ export function main({
    console.log(`Merged "globals" into global scope.`);
  }

-  // Since we flatten the namespaces, we don't attempt to import `text-encoding`
-  // so we then need to concatenate that onto the `libDts` so it can stand on
-  // its own.
-  const textEncodingSourceFile = outputProject.getSourceFileOrThrow(
-    textEncodingFilePath
-  );
-  appendSourceFile(textEncodingSourceFile, libDTs);
-  // Removing it from the project so we know the libDTs can stand on its own.
-  outputProject.removeSourceFile(textEncodingSourceFile);
-
  // Add the preamble
  libDTs.insertStatements(0, libPreamble);