fix: TextEncoder#encodeInto spec compliance + perf gains (#10129)

2024-12-22 07:14:47 -05:00 · 2021-05-08 18:31:40 -03:00 · 2021-05-08 18:31:40 -03:00 · 18a684ab1c
commit 18a684ab1c
parent a051a7f7bc
5 changed files with 215 additions and 91 deletions
--- a/cli/bench/main.rs
+++ b/cli/bench/main.rs
@ -89,6 +89,11 @@ const EXEC_TIME_BENCHMARKS: &[(&str, &[&str], Option<i32>)] = &[
    &["run", "cli/tests/text_encoder_perf.js"],
    None,
  ),
  (
    "text_encoder_into",
    &["run", "cli/tests/text_encoder_into_perf.js"],
    None,
  ),
  (
    "check",
    &[
--- a/cli/tests/text_encoder_into_perf.js
+++ b/cli/tests/text_encoder_into_perf.js
@ -0,0 +1,34 @@
 // Benchmark input alphabet: one character for each UTF-8 encoded width
 // (1, 2, 3 and 4 bytes). The last one is a surrogate pair, so it occupies two
 // UTF-16 code units (indices 3 and 4).
 const mixed = "@Ā๐😀";

 // Builds a random string out of the characters in `mixed`.
 //
 // Despite its name, `bytes` is measured in UTF-16 code units: the returned
 // string always has `length === bytes`. The surrogate pair is only picked
 // while at least four code units are still missing, so it never overshoots.
 function generateRandom(bytes) {
  // Entry N is what gets appended when the random pick is N.
  const pieces = [mixed[0], mixed[1], mixed[2], mixed[3] + mixed[4]];
  let out = "";
  for (let used = 0; used < bytes;) {
    const remaining = bytes - used;
    const pick = Math.floor(Math.random() * Math.min(4, remaining));
    const piece = pieces[pick];
    out += piece;
    used += piece.length;
  }
  return out;
 }

 // Encode the same 1024 code unit string 10,000 times into a buffer that is
 // large enough for the worst case of 4 bytes per code unit.
 const randomData = generateRandom(1024);
 const encoder = new TextEncoder();
 const targetBuffer = new Uint8Array(randomData.length * 4);
 let remainingRuns = 10_000;
 while (remainingRuns-- > 0) {
  encoder.encodeInto(randomData, targetBuffer);
 }
--- a/cli/tests/unit/text_encoding_test.ts
+++ b/cli/tests/unit/text_encoding_test.ts
@ -157,6 +157,62 @@ unitTest(function textEncodeInto3(): void {
  ]);
 });
 unitTest(function loneSurrogateEncodeInto(): void {
  // A lone high surrogate (\ud888) sits between a valid surrogate pair and
  // plain ASCII text. It must be emitted as U+FFFD (ef bf bd) while everything
  // around it is encoded normally.
  const dest = new Uint8Array(20);
  const { read, written } = new TextEncoder().encodeInto(
    "lone𝄞\ud888surrogate",
    dest,
  );
  // deno-fmt-ignore
  const expected = [
    0x6c, 0x6f, 0x6e, 0x65,
    0xf0, 0x9d, 0x84, 0x9e,
    0xef, 0xbf, 0xbd, 0x73,
    0x75, 0x72, 0x72, 0x6f,
    0x67, 0x61, 0x74, 0x65
  ];
  assertEquals(read, 16);
  assertEquals(written, 20);
  assertEquals(Array.from(dest), expected);
 });
 unitTest(function loneSurrogateEncodeInto2(): void {
  // A string made of a single unpaired high surrogate encodes to U+FFFD.
  const dest = new Uint8Array(3);
  const { read, written } = new TextEncoder().encodeInto("\ud800", dest);
  // deno-fmt-ignore
  const expected = [
    0xef, 0xbf, 0xbd
  ];
  assertEquals(read, 1);
  assertEquals(written, 3);
  assertEquals(Array.from(dest), expected);
 });
 unitTest(function loneSurrogateEncodeInto3(): void {
  // A string made of a single unpaired low surrogate encodes to U+FFFD.
  const dest = new Uint8Array(3);
  const { read, written } = new TextEncoder().encodeInto("\udc00", dest);
  // deno-fmt-ignore
  const expected = [
    0xef, 0xbf, 0xbd
  ];
  assertEquals(read, 1);
  assertEquals(written, 3);
  assertEquals(Array.from(dest), expected);
 });
 unitTest(function swappedSurrogatePairEncodeInto4(): void {
  // Low surrogate followed by high surrogate is not a valid pair: each half is
  // replaced by its own U+FFFD, and the two spare bytes of the destination
  // keep their initial zero value.
  const dest = new Uint8Array(8);
  const { read, written } = new TextEncoder().encodeInto(
    "\udc00\ud800",
    dest,
  );
  // deno-fmt-ignore
  const expected = [
    0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, 0x00, 0x00
  ];
  assertEquals(read, 2);
  assertEquals(written, 6);
  assertEquals(Array.from(dest), expected);
 });
 unitTest(function textDecoderSharedUint8Array(): void {
  const ab = new SharedArrayBuffer(6);
  const dataView = new DataView(ab);
--- a/extensions/web/08_text_encoding.js
+++ b/extensions/web/08_text_encoding.js
@ -48,51 +48,129 @@
    return inRange(a, 0x00, 0x7f);
  }
-  function stringToCodePoints(input) {
+  // Minor Unicode reference for readers.
-    const u = [];
+  //
-    for (const c of input) {
+  // Unicode code points are integers in the range 0x0 - 0x10ffff (using at
-      u.push(c.codePointAt(0));
+  // most 21 bits). These integers are what rendering engines use to decide what
-    }
+  // glyphs are displayed on the screen. Since most code points use less than
-    return u;
+  // 21-bits, there are encodings that can represent code points more
-  }
+  // efficiently.
-
+  //
-  class UTF8Encoder {
+  // UTF-16 is one such encoding, and is used by JavaScript engines to store
-    handler(codePoint) {
+  // strings internally. UTF-16 uses 1 or 2 16-bit integers (2 or 4 bytes) to
-      if (codePoint === END_OF_STREAM) {
+  // represent a single code point.
-        return "finished";
+  //
  // UTF-8 is another encoding, and uses 1, 2, 3 or 4 bytes to represent a
  // single code point.
  //
  // The goal of the function below is to transform UTF-16 into UTF-8 without
  // allocating any memory (writing to the buffer passed as parameter). The
  // conversion loop is roughly divided into 3 steps:
  //
  // - Decode UTF-16 into Unicode.
  // - Check if there's still enough space in the output buffer. If not, break
  //   out of the loop.
  // - Encode UTF-8 into the output buffer.
  //
  // Some references to learn more about the topic:
  // - https://dmitripavlutin.com/what-every-javascript-developer-should-know-about-unicode
  // - https://en.wikipedia.org/wiki/UTF-8
  // - https://en.wikipedia.org/wiki/UTF-16
  function encodeUtf8(input, output, state) {
    let { read, written } = state;
    const inLen = input.length;
    const outLen = output.length;
    while (read < inLen) {
      // Step 1: Decode the UTF-16 code unit(s) into an unicode code point.
      //
      // There are three possibilities here:
      // - The code unit is outside the high surrogate range and is treated as
      //   the code point.
      // - The code unit is in the high surrogate range and the next one
      //   is in the low surrogate range. The surrogate pair is combined into
      //   the final code point.
      // - The code unit is a lone surrogate (high or low) which is invalid in
      //   UTF-16. In this case it is replaced by 0xfffd (<28> )
      const badCodePoint = 0xfffd;
      const codeUnit = input.charCodeAt(read++);
      const surrogateMask = codeUnit & 0xfc00;
      let codePoint = codeUnit;
      if (surrogateMask === 0xd800) {
        // codeUnit is a high surrogate, check if there's a next character
        if (read < inLen) {
          // check if the next one is a low surrogate
          const nextCodeUnit = input.charCodeAt(read);
          if ((nextCodeUnit & 0xfc00) === 0xdc00) {
            // low surrogate, advance input offset and compute code point
            codePoint = 0x10000 +
              ((codeUnit & 0x3ff) << 10) + (nextCodeUnit & 0x3ff);
            read++;
          } else {
            // lone high surrogate
            codePoint = badCodePoint;
          }
        } else {
          // lone high surrogate
          codePoint = badCodePoint;
        }
      } else if (surrogateMask === 0xdc00) {
        // lone low surrogate
        codePoint = badCodePoint;
      }
-      if (inRange(codePoint, 0x00, 0x7f)) {
+      // Step 2: Check if there's available space to encode the code point as
-        return [codePoint];
+      // UTF-8. It will take at most 4 bytes, only need to check if the
      // available space is lower than that.
      const availableSpace = outLen - written;
      if (availableSpace < 4) {
        // Possibly not enough space, make the final decision based on the code
        // point range.
        if (
          availableSpace < 1 ||
          (availableSpace < 2 && codePoint >= 0x80) ||
          (availableSpace < 3 && codePoint >= 0x800) ||
          codePoint >= 0x10000
        ) {
          // Not enough space. Rewind read offset and bail out
          const isSurrogatePair = codePoint !== codeUnit &&
            codePoint !== badCodePoint;
          read -= isSurrogatePair ? 2 : 1;
          break;
        }
      }
-      let count;
+      // Step 3: Encode the code point as UTF-8 into the output buffer.
-      let offset;
+      if (codePoint < 0x80) {
-      if (inRange(codePoint, 0x0080, 0x07ff)) {
+        // 7 bits, encoded in 1 byte directly (0xxxxxxx).
-        count = 1;
+        output[written++] = codePoint;
-        offset = 0xc0;
+      } else if (codePoint < 0x800) {
-      } else if (inRange(codePoint, 0x0800, 0xffff)) {
+        // 11 bits, encode in 2 bytes where:
-        count = 2;
+        // byte 1: 110xxxxx (5 bits)
-        offset = 0xe0;
+        // byte 2: 10xxxxxx (6 bits)
-      } else if (inRange(codePoint, 0x10000, 0x10ffff)) {
+        output[written++] = 0xc0 | (0x1f & (codePoint >> 6));
-        count = 3;
+        output[written++] = 0x80 | (0x3f & (codePoint));
-        offset = 0xf0;
+      } else if (codePoint < 0x10000) {
        // 16 bits, encode in 3 bytes where:
        // byte 1: 1110xxxx (4 bits)
        // byte 2: 10xxxxxx (6 bits)
        // byte 3: 10xxxxxx (6 bits)
        output[written++] = 0xe0 | (0x0f & (codePoint >> 12));
        output[written++] = 0x80 | (0x3f & (codePoint >> 6));
        output[written++] = 0x80 | (0x3f & (codePoint));
      } else {
-        throw TypeError(
+        // 21 bits, encode in 4 bytes where:
-          `Code point out of range: \\x${codePoint.toString(16)}`,
+        // byte 1: 11110xxx (3 bits)
-        );
+        // byte 2: 10xxxxxx (6 bits)
        // byte 3: 10xxxxxx (6 bits)
        // byte 4: 10xxxxxx (6 bits)
        output[written++] = 0xf0 | (0x07 & (codePoint >> 18));
        output[written++] = 0x80 | (0x3f & (codePoint >> 12));
        output[written++] = 0x80 | (0x3f & (codePoint >> 6));
        output[written++] = 0x80 | (0x3f & (codePoint));
      }
      const bytes = [(codePoint >> (6 * count)) + offset];
      while (count > 0) {
        const temp = codePoint >> (6 * (count - 1));
        bytes.push(0x80 | (temp & 0x3f));
        count--;
      }
      return bytes;
    }
    state.read = read;
    state.written = written;
  }
  function atob(s) {
@ -4221,37 +4299,12 @@
          "2nd argument to TextEncoder.encodeInto must be Uint8Array",
        );
      }
      const state = { read: 0, written: 0 };
      if (dest.byteLength === 0) {
-        return { read: 0, written: 0 };
+        return state;
      }
-      const encoder = new UTF8Encoder();
+      encodeUtf8(input, dest, state);
-      const inputStream = new Stream(stringToCodePoints(input));
+      return state;
      let written = 0;
      let read = 0;
      while (true) {
        const item = inputStream.read();
        const result = encoder.handler(item);
        if (result === "finished") {
          break;
        }
        if (dest.length - written >= result.length) {
          read++;
          if (item > 0xFFFF) {
            // increment read a second time if greater than U+FFFF
            read++;
          }
          dest.set(result, written);
          written += result.length;
        } else {
          break;
        }
      }
      return {
        read,
        written,
      };
    }
    get [Symbol.toStringTag]() {
      return "TextEncoder";
--- a/tools/wpt/expectation.json
+++ b/tools/wpt/expectation.json
@ -39,30 +39,6 @@
    "api-replacement-encodings.any.js": true,
    "api-surrogates-utf8.any.js": true,
    "encodeInto.any.js": [
      "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0",
      "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0",
      "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0",
      "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0",
      "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128",
      "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128",
      "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128",
      "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128",
      "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random",
      "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random",
      "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random",
      "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random",
      "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0",
      "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0",
      "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0",
      "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0",
      "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128",
      "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128",
      "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128",
      "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128",
      "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler random",
      "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler random",
      "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler random",
      "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler random",
      "encodeInto() and a detached output buffer"
    ],
    "idlharness.any.js": [