mirror of
https://github.com/denoland/deno.git
synced 2024-12-22 15:24:46 -05:00
fix: TextEncoder#encodeInto spec compliance + perf gains (#10129)
This commit is contained in:
parent
a051a7f7bc
commit
18a684ab1c
5 changed files with 215 additions and 91 deletions
|
@ -89,6 +89,11 @@ const EXEC_TIME_BENCHMARKS: &[(&str, &[&str], Option<i32>)] = &[
|
||||||
&["run", "cli/tests/text_encoder_perf.js"],
|
&["run", "cli/tests/text_encoder_perf.js"],
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"text_encoder_into",
|
||||||
|
&["run", "cli/tests/text_encoder_into_perf.js"],
|
||||||
|
None,
|
||||||
|
),
|
||||||
(
|
(
|
||||||
"check",
|
"check",
|
||||||
&[
|
&[
|
||||||
|
|
34
cli/tests/text_encoder_into_perf.js
Normal file
34
cli/tests/text_encoder_into_perf.js
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
const mixed = "@Ā๐😀";
|
||||||
|
|
||||||
|
function generateRandom(bytes) {
|
||||||
|
let result = "";
|
||||||
|
let i = 0;
|
||||||
|
while (i < bytes) {
|
||||||
|
const toAdd = Math.floor(Math.random() * Math.min(4, bytes - i));
|
||||||
|
switch (toAdd) {
|
||||||
|
case 0:
|
||||||
|
result += mixed[0];
|
||||||
|
i++;
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
result += mixed[1];
|
||||||
|
i++;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
result += mixed[2];
|
||||||
|
i++;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
result += mixed[3];
|
||||||
|
result += mixed[4];
|
||||||
|
i += 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
const randomData = generateRandom(1024);
|
||||||
|
const encoder = new TextEncoder();
|
||||||
|
const targetBuffer = new Uint8Array(randomData.length * 4);
|
||||||
|
for (let i = 0; i < 10_000; i++) encoder.encodeInto(randomData, targetBuffer);
|
|
@ -157,6 +157,62 @@ unitTest(function textEncodeInto3(): void {
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
unitTest(function loneSurrogateEncodeInto(): void {
|
||||||
|
const fixture = "lone𝄞\ud888surrogate";
|
||||||
|
const encoder = new TextEncoder();
|
||||||
|
const bytes = new Uint8Array(20);
|
||||||
|
const result = encoder.encodeInto(fixture, bytes);
|
||||||
|
assertEquals(result.read, 16);
|
||||||
|
assertEquals(result.written, 20);
|
||||||
|
// deno-fmt-ignore
|
||||||
|
assertEquals(Array.from(bytes), [
|
||||||
|
0x6c, 0x6f, 0x6e, 0x65,
|
||||||
|
0xf0, 0x9d, 0x84, 0x9e,
|
||||||
|
0xef, 0xbf, 0xbd, 0x73,
|
||||||
|
0x75, 0x72, 0x72, 0x6f,
|
||||||
|
0x67, 0x61, 0x74, 0x65
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
unitTest(function loneSurrogateEncodeInto2(): void {
|
||||||
|
const fixture = "\ud800";
|
||||||
|
const encoder = new TextEncoder();
|
||||||
|
const bytes = new Uint8Array(3);
|
||||||
|
const result = encoder.encodeInto(fixture, bytes);
|
||||||
|
assertEquals(result.read, 1);
|
||||||
|
assertEquals(result.written, 3);
|
||||||
|
// deno-fmt-ignore
|
||||||
|
assertEquals(Array.from(bytes), [
|
||||||
|
0xef, 0xbf, 0xbd
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
unitTest(function loneSurrogateEncodeInto3(): void {
|
||||||
|
const fixture = "\udc00";
|
||||||
|
const encoder = new TextEncoder();
|
||||||
|
const bytes = new Uint8Array(3);
|
||||||
|
const result = encoder.encodeInto(fixture, bytes);
|
||||||
|
assertEquals(result.read, 1);
|
||||||
|
assertEquals(result.written, 3);
|
||||||
|
// deno-fmt-ignore
|
||||||
|
assertEquals(Array.from(bytes), [
|
||||||
|
0xef, 0xbf, 0xbd
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
unitTest(function swappedSurrogatePairEncodeInto4(): void {
|
||||||
|
const fixture = "\udc00\ud800";
|
||||||
|
const encoder = new TextEncoder();
|
||||||
|
const bytes = new Uint8Array(8);
|
||||||
|
const result = encoder.encodeInto(fixture, bytes);
|
||||||
|
assertEquals(result.read, 2);
|
||||||
|
assertEquals(result.written, 6);
|
||||||
|
// deno-fmt-ignore
|
||||||
|
assertEquals(Array.from(bytes), [
|
||||||
|
0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, 0x00, 0x00
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
unitTest(function textDecoderSharedUint8Array(): void {
|
unitTest(function textDecoderSharedUint8Array(): void {
|
||||||
const ab = new SharedArrayBuffer(6);
|
const ab = new SharedArrayBuffer(6);
|
||||||
const dataView = new DataView(ab);
|
const dataView = new DataView(ab);
|
||||||
|
|
|
@ -48,51 +48,129 @@
|
||||||
return inRange(a, 0x00, 0x7f);
|
return inRange(a, 0x00, 0x7f);
|
||||||
}
|
}
|
||||||
|
|
||||||
function stringToCodePoints(input) {
|
// Minor Unicode reference for readers.
|
||||||
const u = [];
|
//
|
||||||
for (const c of input) {
|
// Unicode code points are integers in the range 0x0 - 0x10ffff, (using at
|
||||||
u.push(c.codePointAt(0));
|
// most 21 bits). These integers are what rendering engines use to decide what
|
||||||
}
|
// glyphs are displayed on the screen. Since most code points use less than
|
||||||
return u;
|
// 21-bits, there are encodings that can represent code points more
|
||||||
}
|
// efficiently.
|
||||||
|
//
|
||||||
class UTF8Encoder {
|
// UTF-16 is one such encoding, and is used by JavaScript engines to store
|
||||||
handler(codePoint) {
|
// strings internally. UTF-16 uses 1 or 2 16-bit integers (2 or 4 bytes) to
|
||||||
if (codePoint === END_OF_STREAM) {
|
// represent a single code point.
|
||||||
return "finished";
|
//
|
||||||
|
// UTF-8 is another encoding, and uses 1, 2, 3 or 4 bytes to represent a
|
||||||
|
// single code point.
|
||||||
|
//
|
||||||
|
// The goal of the function below is to transform UTF-16 into UTF-8 without
|
||||||
|
// allocating any memory (writing to the buffer passed as parameter). The
|
||||||
|
// conversion loop is roughly divided into 3 steps:
|
||||||
|
//
|
||||||
|
// - Decode UTF-16 into Unicode.
|
||||||
|
// - Check if there's still enough space in the output buffer. If not, break
|
||||||
|
// out of the loop.
|
||||||
|
// - Encode UTF-8 into the output buffer.
|
||||||
|
//
|
||||||
|
// Some references to learn more about the topic:
|
||||||
|
// - https://dmitripavlutin.com/what-every-javascript-developer-should-know-about-unicode
|
||||||
|
// - https://en.wikipedia.org/wiki/UTF-8
|
||||||
|
// - https://en.wikipedia.org/wiki/UTF-16
|
||||||
|
function encodeUtf8(input, output, state) {
|
||||||
|
let { read, written } = state;
|
||||||
|
const inLen = input.length;
|
||||||
|
const outLen = output.length;
|
||||||
|
while (read < inLen) {
|
||||||
|
// Step 1: Decode the UTF-16 code unit(s) into a Unicode code point.
|
||||||
|
//
|
||||||
|
// There are three possibilities here:
|
||||||
|
// - The code unit is outside the high surrogate range and is treated as
|
||||||
|
// the code point.
|
||||||
|
// - The code unit is in the high surrogate range and the next one
|
||||||
|
// is in the low surrogate range. The surrogate pair is combined into
|
||||||
|
// the final code point.
|
||||||
|
// - The code unit is a lone surrogate (high or low) which is invalid in
|
||||||
|
// UTF-16. In this case it is replaced by 0xfffd (U+FFFD, the replacement character).
|
||||||
|
const badCodePoint = 0xfffd;
|
||||||
|
const codeUnit = input.charCodeAt(read++);
|
||||||
|
const surrogateMask = codeUnit & 0xfc00;
|
||||||
|
let codePoint = codeUnit;
|
||||||
|
if (surrogateMask === 0xd800) {
|
||||||
|
// codeUnit is a high surrogate, check if there's a next character
|
||||||
|
if (read < inLen) {
|
||||||
|
// check if the next one is a low surrogate
|
||||||
|
const nextCodeUnit = input.charCodeAt(read);
|
||||||
|
if ((nextCodeUnit & 0xfc00) === 0xdc00) {
|
||||||
|
// low surrogate, advance input offset and compute code point
|
||||||
|
codePoint = 0x10000 +
|
||||||
|
((codeUnit & 0x3ff) << 10) + (nextCodeUnit & 0x3ff);
|
||||||
|
read++;
|
||||||
|
} else {
|
||||||
|
// lone high surrogate
|
||||||
|
codePoint = badCodePoint;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// lone high surrogate
|
||||||
|
codePoint = badCodePoint;
|
||||||
|
}
|
||||||
|
} else if (surrogateMask === 0xdc00) {
|
||||||
|
// lone low surrogate
|
||||||
|
codePoint = badCodePoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inRange(codePoint, 0x00, 0x7f)) {
|
// Step 2: Check if there's available space to encode the code point as
|
||||||
return [codePoint];
|
// UTF-8. It will take at most 4 bytes, only need to check if the
|
||||||
|
// available space is lower than that.
|
||||||
|
const availableSpace = outLen - written;
|
||||||
|
if (availableSpace < 4) {
|
||||||
|
// Possibly not enough space, make the final decision based on the code
|
||||||
|
// point range.
|
||||||
|
if (
|
||||||
|
availableSpace < 1 ||
|
||||||
|
(availableSpace < 2 && codePoint >= 0x80) ||
|
||||||
|
(availableSpace < 3 && codePoint >= 0x800) ||
|
||||||
|
codePoint >= 0x10000
|
||||||
|
) {
|
||||||
|
// Not enough space. Rewind read offset and bail out
|
||||||
|
const isSurrogatePair = codePoint !== codeUnit &&
|
||||||
|
codePoint !== badCodePoint;
|
||||||
|
read -= isSurrogatePair ? 2 : 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let count;
|
// Step 3: Encode the code point as UTF-8 into the output buffer.
|
||||||
let offset;
|
if (codePoint < 0x80) {
|
||||||
if (inRange(codePoint, 0x0080, 0x07ff)) {
|
// 7 bits, encoded in 1 byte directly (0xxxxxxx).
|
||||||
count = 1;
|
output[written++] = codePoint;
|
||||||
offset = 0xc0;
|
} else if (codePoint < 0x800) {
|
||||||
} else if (inRange(codePoint, 0x0800, 0xffff)) {
|
// 11 bits, encode in 2 bytes where:
|
||||||
count = 2;
|
// byte 1: 110xxxxx (5 bits)
|
||||||
offset = 0xe0;
|
// byte 2: 10xxxxxx (6 bits)
|
||||||
} else if (inRange(codePoint, 0x10000, 0x10ffff)) {
|
output[written++] = 0xc0 | (0x1f & (codePoint >> 6));
|
||||||
count = 3;
|
output[written++] = 0x80 | (0x3f & (codePoint));
|
||||||
offset = 0xf0;
|
} else if (codePoint < 0x10000) {
|
||||||
|
// 16 bits, encode in 3 bytes where:
|
||||||
|
// byte 1: 1110xxxx (4 bits)
|
||||||
|
// byte 2: 10xxxxxx (6 bits)
|
||||||
|
// byte 3: 10xxxxxx (6 bits)
|
||||||
|
output[written++] = 0xe0 | (0x0f & (codePoint >> 12));
|
||||||
|
output[written++] = 0x80 | (0x3f & (codePoint >> 6));
|
||||||
|
output[written++] = 0x80 | (0x3f & (codePoint));
|
||||||
} else {
|
} else {
|
||||||
throw TypeError(
|
// 21 bits, encode in 4 bytes where:
|
||||||
`Code point out of range: \\x${codePoint.toString(16)}`,
|
// byte 1: 11110xxx (3 bits)
|
||||||
);
|
// byte 2: 10xxxxxx (6 bits)
|
||||||
|
// byte 3: 10xxxxxx (6 bits)
|
||||||
|
// byte 4: 10xxxxxx (6 bits)
|
||||||
|
output[written++] = 0xf0 | (0x07 & (codePoint >> 18));
|
||||||
|
output[written++] = 0x80 | (0x3f & (codePoint >> 12));
|
||||||
|
output[written++] = 0x80 | (0x3f & (codePoint >> 6));
|
||||||
|
output[written++] = 0x80 | (0x3f & (codePoint));
|
||||||
}
|
}
|
||||||
|
|
||||||
const bytes = [(codePoint >> (6 * count)) + offset];
|
|
||||||
|
|
||||||
while (count > 0) {
|
|
||||||
const temp = codePoint >> (6 * (count - 1));
|
|
||||||
bytes.push(0x80 | (temp & 0x3f));
|
|
||||||
count--;
|
|
||||||
}
|
|
||||||
|
|
||||||
return bytes;
|
|
||||||
}
|
}
|
||||||
|
state.read = read;
|
||||||
|
state.written = written;
|
||||||
}
|
}
|
||||||
|
|
||||||
function atob(s) {
|
function atob(s) {
|
||||||
|
@ -4221,37 +4299,12 @@
|
||||||
"2nd argument to TextEncoder.encodeInto must be Uint8Array",
|
"2nd argument to TextEncoder.encodeInto must be Uint8Array",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
const state = { read: 0, written: 0 };
|
||||||
if (dest.byteLength === 0) {
|
if (dest.byteLength === 0) {
|
||||||
return { read: 0, written: 0 };
|
return state;
|
||||||
}
|
}
|
||||||
const encoder = new UTF8Encoder();
|
encodeUtf8(input, dest, state);
|
||||||
const inputStream = new Stream(stringToCodePoints(input));
|
return state;
|
||||||
|
|
||||||
let written = 0;
|
|
||||||
let read = 0;
|
|
||||||
while (true) {
|
|
||||||
const item = inputStream.read();
|
|
||||||
const result = encoder.handler(item);
|
|
||||||
if (result === "finished") {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (dest.length - written >= result.length) {
|
|
||||||
read++;
|
|
||||||
if (item > 0xFFFF) {
|
|
||||||
// increment read a second time if greater than U+FFFF
|
|
||||||
read++;
|
|
||||||
}
|
|
||||||
dest.set(result, written);
|
|
||||||
written += result.length;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
read,
|
|
||||||
written,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
get [Symbol.toStringTag]() {
|
get [Symbol.toStringTag]() {
|
||||||
return "TextEncoder";
|
return "TextEncoder";
|
||||||
|
|
|
@ -39,30 +39,6 @@
|
||||||
"api-replacement-encodings.any.js": true,
|
"api-replacement-encodings.any.js": true,
|
||||||
"api-surrogates-utf8.any.js": true,
|
"api-surrogates-utf8.any.js": true,
|
||||||
"encodeInto.any.js": [
|
"encodeInto.any.js": [
|
||||||
"encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0",
|
|
||||||
"encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0",
|
|
||||||
"encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0",
|
|
||||||
"encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0",
|
|
||||||
"encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128",
|
|
||||||
"encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128",
|
|
||||||
"encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128",
|
|
||||||
"encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128",
|
|
||||||
"encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random",
|
|
||||||
"encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random",
|
|
||||||
"encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random",
|
|
||||||
"encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random",
|
|
||||||
"encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0",
|
|
||||||
"encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0",
|
|
||||||
"encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0",
|
|
||||||
"encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0",
|
|
||||||
"encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128",
|
|
||||||
"encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128",
|
|
||||||
"encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128",
|
|
||||||
"encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128",
|
|
||||||
"encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler random",
|
|
||||||
"encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler random",
|
|
||||||
"encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler random",
|
|
||||||
"encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler random",
|
|
||||||
"encodeInto() and a detached output buffer"
|
"encodeInto() and a detached output buffer"
|
||||||
],
|
],
|
||||||
"idlharness.any.js": [
|
"idlharness.any.js": [
|
||||||
|
|
Loading…
Reference in a new issue