From 18a684ab1c20914e13c27bc10e20bda6396ea38d Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 8 May 2021 18:31:40 -0300 Subject: [PATCH] fix: TextEncoder#encodeInto spec compliance + perf gains (#10129) --- cli/bench/main.rs | 5 + cli/tests/text_encoder_into_perf.js | 34 +++++ cli/tests/unit/text_encoding_test.ts | 56 ++++++++ extensions/web/08_text_encoding.js | 187 +++++++++++++++++---------- tools/wpt/expectation.json | 24 ---- 5 files changed, 215 insertions(+), 91 deletions(-) create mode 100644 cli/tests/text_encoder_into_perf.js diff --git a/cli/bench/main.rs b/cli/bench/main.rs index f2ade54d80..b98a9d141e 100644 --- a/cli/bench/main.rs +++ b/cli/bench/main.rs @@ -89,6 +89,11 @@ const EXEC_TIME_BENCHMARKS: &[(&str, &[&str], Option)] = &[ &["run", "cli/tests/text_encoder_perf.js"], None, ), + ( + "text_encoder_into", + &["run", "cli/tests/text_encoder_into_perf.js"], + None, + ), ( "check", &[ diff --git a/cli/tests/text_encoder_into_perf.js b/cli/tests/text_encoder_into_perf.js new file mode 100644 index 0000000000..8d60e9f000 --- /dev/null +++ b/cli/tests/text_encoder_into_perf.js @@ -0,0 +1,34 @@ +const mixed = "@Ā๐😀"; + +function generateRandom(bytes) { + let result = ""; + let i = 0; + while (i < bytes) { + const toAdd = Math.floor(Math.random() * Math.min(4, bytes - i)); + switch (toAdd) { + case 0: + result += mixed[0]; + i++; + break; + case 1: + result += mixed[1]; + i++; + break; + case 2: + result += mixed[2]; + i++; + break; + case 3: + result += mixed[3]; + result += mixed[4]; + i += 2; + break; + } + } + return result; +} + +const randomData = generateRandom(1024); +const encoder = new TextEncoder(); +const targetBuffer = new Uint8Array(randomData.length * 4); +for (let i = 0; i < 10_000; i++) encoder.encodeInto(randomData, targetBuffer); diff --git a/cli/tests/unit/text_encoding_test.ts b/cli/tests/unit/text_encoding_test.ts index 7a15c93768..42c221cb21 100644 --- a/cli/tests/unit/text_encoding_test.ts +++ b/cli/tests/unit/text_encoding_test.ts @@ -157,6 +157,62 @@ unitTest(function textEncodeInto3(): void { ]); }); +unitTest(function loneSurrogateEncodeInto(): void { + const fixture = "lone𝄞\ud888surrogate"; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(20); + const result = encoder.encodeInto(fixture, bytes); + assertEquals(result.read, 16); + assertEquals(result.written, 20); + // deno-fmt-ignore + assertEquals(Array.from(bytes), [ + 0x6c, 0x6f, 0x6e, 0x65, + 0xf0, 0x9d, 0x84, 0x9e, + 0xef, 0xbf, 0xbd, 0x73, + 0x75, 0x72, 0x72, 0x6f, + 0x67, 0x61, 0x74, 0x65 + ]); +}); + +unitTest(function loneSurrogateEncodeInto2(): void { + const fixture = "\ud800"; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(3); + const result = encoder.encodeInto(fixture, bytes); + assertEquals(result.read, 1); + assertEquals(result.written, 3); + // deno-fmt-ignore + assertEquals(Array.from(bytes), [ + 0xef, 0xbf, 0xbd + ]); +}); + +unitTest(function loneSurrogateEncodeInto3(): void { + const fixture = "\udc00"; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(3); + const result = encoder.encodeInto(fixture, bytes); + assertEquals(result.read, 1); + assertEquals(result.written, 3); + // deno-fmt-ignore + assertEquals(Array.from(bytes), [ + 0xef, 0xbf, 0xbd + ]); +}); + +unitTest(function swappedSurrogatePairEncodeInto4(): void { + const fixture = "\udc00\ud800"; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(8); + const result = encoder.encodeInto(fixture, bytes); + assertEquals(result.read, 2); + assertEquals(result.written, 6); + // deno-fmt-ignore + assertEquals(Array.from(bytes), [ + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, 0x00, 0x00 + ]); +}); + unitTest(function textDecoderSharedUint8Array(): void { const ab = new SharedArrayBuffer(6); const dataView = new DataView(ab); diff --git a/extensions/web/08_text_encoding.js b/extensions/web/08_text_encoding.js index 7e7d4a573b..c293633c31 100644 --- a/extensions/web/08_text_encoding.js +++ b/extensions/web/08_text_encoding.js @@ -48,51 +48,129 @@ return inRange(a, 0x00, 0x7f); } - function stringToCodePoints(input) { - const u = []; - for (const c of input) { - u.push(c.codePointAt(0)); - } - return u; - } - - class UTF8Encoder { - handler(codePoint) { - if (codePoint === END_OF_STREAM) { - return "finished"; + // Minor Unicode reference for readers. + // + // Unicode code points are integers in the range 0x0 - 0x10ffff, (using at + // most 21 bits). These integers are what rendering engines use to decide what + // glyphs are displayed on the screen. Since most code points use less than + // 21-bits, there are encodings that can represent code points more + // efficiently. + // + // UTF-16 is one such encoding, and is used by Javascript engines to store + // strings internally. UTF-16 uses 1 or 2 16-bit integers (2 or 4 bytes) to + // represent a single code point. + // + // UTF-8 is another encoding, and uses 1, 2, 3 or 4 bytes to represent a + // single code point. + // + // The goal of the function below is to transform UTF-16 into UTF-8 without + // allocating any memory (writing to the buffer passed as parameter). The + // conversion loop is roughly divided into 3 steps: + // + // - Decode UTF-16 into Unicode. + // - Check if there's still enough space in the output buffer. If not, break + // out of the loop. + // - Encode UTF-8 into the output buffer. + // + // Some references to learn more about the topic: + // - https://dmitripavlutin.com/what-every-javascript-developer-should-know-about-unicode + // - https://en.wikipedia.org/wiki/UTF-8 + // - https://en.wikipedia.org/wiki/UTF-16 + function encodeUtf8(input, output, state) { + let { read, written } = state; + const inLen = input.length; + const outLen = output.length; + while (read < inLen) { + // Step 1: Decode the UTF-16 code unit(s) into an unicode code point. + // + // There are three possibilities here: + // - The code unit is outside the high surrogate range and is treated as + // the code point. + // - The code unit is in the high surrogate range and the next one + // is in the low surrogate range. The surrogate pair is combined into + // the final code point. + // - The code unit is a lone surrogate (high or low) which is invalid in + // UTF-16. In this case it is replaced by 0xfffd (� ) + const badCodePoint = 0xfffd; + const codeUnit = input.charCodeAt(read++); + const surrogateMask = codeUnit & 0xfc00; + let codePoint = codeUnit; + if (surrogateMask === 0xd800) { + // codeUnit is a high surrogate, check if there's a next character + if (read < inLen) { + // check if the next one is a low surrogate + const nextCodeUnit = input.charCodeAt(read); + if ((nextCodeUnit & 0xfc00) === 0xdc00) { + // low surrogate, advance input offset and compute code point + codePoint = 0x10000 + + ((codeUnit & 0x3ff) << 10) + (nextCodeUnit & 0x3ff); + read++; + } else { + // lone high surrogate + codePoint = badCodePoint; + } + } else { + // lone high surrogate + codePoint = badCodePoint; + } + } else if (surrogateMask === 0xdc00) { + // lone low surrogate + codePoint = badCodePoint; } - if (inRange(codePoint, 0x00, 0x7f)) { - return [codePoint]; + // Step 2: Check if there's available space to encode the code point as + // UTF-8. It will take at most 4 bytes, only need to check if the + // available space is lower than that. + const availableSpace = outLen - written; + if (availableSpace < 4) { + // Possibly not enough space, make the final decision based on the code + // point range. + if ( + availableSpace < 1 || + (availableSpace < 2 && codePoint >= 0x80) || + (availableSpace < 3 && codePoint >= 0x800) || + codePoint >= 0x10000 + ) { + // Not enough space. Rewind read offset and bail out + const isSurrogatePair = codePoint !== codeUnit && + codePoint !== badCodePoint; + read -= isSurrogatePair ? 2 : 1; + break; + } } - let count; - let offset; - if (inRange(codePoint, 0x0080, 0x07ff)) { - count = 1; - offset = 0xc0; - } else if (inRange(codePoint, 0x0800, 0xffff)) { - count = 2; - offset = 0xe0; - } else if (inRange(codePoint, 0x10000, 0x10ffff)) { - count = 3; - offset = 0xf0; + // Step 3: Encode the code point as UTF-8 into the output buffer. + if (codePoint < 0x80) { + // 7 bits, encoded in 1 byte directly (0xxxxxxx). + output[written++] = codePoint; + } else if (codePoint < 0x800) { + // 11 bits, encode in 2 bytes where: + // byte 1: 110xxxxx (5 bits) + // byte 2: 10xxxxxx (6 bits) + output[written++] = 0xc0 | (0x1f & (codePoint >> 6)); + output[written++] = 0x80 | (0x3f & (codePoint)); + } else if (codePoint < 0x10000) { + // 16 bits, encode in 3 bytes where: + // byte 1: 1110xxxx (4 bits) + // byte 2: 10xxxxxx (6 bits) + // byte 3: 10xxxxxx (6 bits) + output[written++] = 0xe0 | (0x0f & (codePoint >> 12)); + output[written++] = 0x80 | (0x3f & (codePoint >> 6)); + output[written++] = 0x80 | (0x3f & (codePoint)); } else { - throw TypeError( - `Code point out of range: \\x${codePoint.toString(16)}`, - ); + // 21 bits, encode in 4 bytes where: + // byte 1: 11110xxx (3 bits) + // byte 2: 10xxxxxx (6 bits) + // byte 3: 10xxxxxx (6 bits) + // byte 4: 10xxxxxx (6 bits) + output[written++] = 0xf0 | (0x07 & (codePoint >> 18)); + output[written++] = 0x80 | (0x3f & (codePoint >> 12)); + output[written++] = 0x80 | (0x3f & (codePoint >> 6)); + output[written++] = 0x80 | (0x3f & (codePoint)); } - - const bytes = [(codePoint >> (6 * count)) + offset]; - - while (count > 0) { - const temp = codePoint >> (6 * (count - 1)); - bytes.push(0x80 | (temp & 0x3f)); - count--; - } - - return bytes; } + state.read = read; + state.written = written; } function atob(s) { @@ -4221,37 +4299,12 @@ "2nd argument to TextEncoder.encodeInto must be Uint8Array", ); } + const state = { read: 0, written: 0 }; if (dest.byteLength === 0) { - return { read: 0, written: 0 }; + return state; } - const encoder = new UTF8Encoder(); - const inputStream = new Stream(stringToCodePoints(input)); - - let written = 0; - let read = 0; - while (true) { - const item = inputStream.read(); - const result = encoder.handler(item); - if (result === "finished") { - break; - } - if (dest.length - written >= result.length) { - read++; - if (item > 0xFFFF) { - // increment read a second time if greater than U+FFFF - read++; - } - dest.set(result, written); - written += result.length; - } else { - break; - } - } - - return { - read, - written, - }; + encodeUtf8(input, dest, state); + return state; } get [Symbol.toStringTag]() { return "TextEncoder"; diff --git a/tools/wpt/expectation.json b/tools/wpt/expectation.json index 407e301bbd..6eec573cb8 100644 --- a/tools/wpt/expectation.json +++ b/tools/wpt/expectation.json @@ -39,30 +39,6 @@ "api-replacement-encodings.any.js": true, "api-surrogates-utf8.any.js": true, "encodeInto.any.js": [ - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0", - "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0", - "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128", - "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128", - "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random", - "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random", - "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0", - "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0", - "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128", - "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128", - "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler random", - "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler random", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler random", - "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler random", "encodeInto() and a detached output buffer" ], "idlharness.any.js": [