From 32c12787361b65bbc55a7b9c1fe43689cb0a8b98 Mon Sep 17 00:00:00 2001 From: Satya Rohith Date: Wed, 2 Oct 2024 13:53:14 +0530 Subject: [PATCH] feat(ext/node): buffer.transcode() (#25972) Closes https://github.com/denoland/deno/issues/25911 --- ext/node/lib.rs | 1 + ext/node/ops/buffer.rs | 106 ++++++++++++++++++ ext/node/polyfills/buffer.ts | 1 + ext/node/polyfills/internal/buffer.mjs | 51 ++++++++- tests/node_compat/config.jsonc | 1 + tests/node_compat/runner/TODO.md | 1 - .../test/parallel/test-icu-transcode.js | 97 ++++++++++++++++ 7 files changed, 255 insertions(+), 3 deletions(-) create mode 100644 tests/node_compat/test/parallel/test-icu-transcode.js diff --git a/ext/node/lib.rs b/ext/node/lib.rs index 0c821ecf8b..d23c072042 100644 --- a/ext/node/lib.rs +++ b/ext/node/lib.rs @@ -167,6 +167,7 @@ deno_core::extension!(deno_node, ops::buffer::op_is_ascii, ops::buffer::op_is_utf8, + ops::buffer::op_transcode, ops::crypto::op_node_check_prime_async, ops::crypto::op_node_check_prime_bytes_async, ops::crypto::op_node_check_prime_bytes, diff --git a/ext/node/ops/buffer.rs b/ext/node/ops/buffer.rs index 74a011ab80..01f878ec15 100644 --- a/ext/node/ops/buffer.rs +++ b/ext/node/ops/buffer.rs @@ -1,5 +1,7 @@ // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license. 
/// Transcodes UTF-8 bytes to Latin-1 (ISO-8859-1), producing one output byte
/// per decoded character.
///
/// * ASCII bytes are copied through unchanged.
/// * A well-formed two-byte sequence (lead `0xC2..=0xDF` followed by a
///   continuation byte) is decoded; code points up to U+00FF map to the
///   Latin-1 byte of the same value, anything higher becomes `?`.
/// * Everything else — three- and four-byte sequences, stray continuation
///   bytes, and truncated or malformed sequences — becomes a single `?`.
///
/// The conversion is lossy and never fails.
fn utf8_to_latin1(source: &[u8]) -> Vec<u8> {
  /// A UTF-8 continuation byte has the bit pattern `10xxxxxx`.
  fn is_continuation(byte: u8) -> bool {
    (byte & 0xC0) == 0x80
  }

  // Every branch emits at most one byte per input byte consumed, so the
  // input length is an upper bound for the output length.
  let mut latin1_bytes = Vec::with_capacity(source.len());
  let mut i = 0;
  while i < source.len() {
    match source[i] {
      byte if byte.is_ascii() => {
        latin1_bytes.push(byte);
        i += 1;
      }
      // Only treat this as a two-byte sequence when the trail byte really is
      // a continuation byte. Without that check a malformed pair such as
      // `[0xC3, b'A']` would be decoded into a bogus character and the valid
      // `A` that follows the broken lead byte would be swallowed.
      lead @ 0xC2..=0xDF
        if source
          .get(i + 1)
          .map_or(false, |&trail| is_continuation(trail)) =>
      {
        let codepoint =
          ((lead as u16 & 0x1F) << 6) | (source[i + 1] as u16 & 0x3F);
        latin1_bytes.push(if codepoint <= 0xFF {
          codepoint as u8
        } else {
          // Valid UTF-8, but outside the Latin-1 range.
          b'?'
        });
        i += 2;
      }
      _ => {
        // Longer sequence, stray continuation byte, or malformed/truncated
        // input: emit one replacement and resynchronise on the next byte
        // that is not a continuation byte.
        latin1_bytes.push(b'?');
        i += 1;
        while i < source.len() && is_continuation(source[i]) {
          i += 1;
        }
      }
    }
  }
  latin1_bytes
}
/**
 * Re-encodes `source` from one character encoding to another and returns the
 * result as a new `Buffer`.
 *
 * @param {Buffer | Uint8Array} source Bytes to transcode.
 * @param {string} fromEnco Name of the encoding `source` is currently in.
 * @param {string} toEnco Name of the encoding to convert to.
 * @returns {Buffer} A new buffer; empty when `source` is empty.
 * @throws {TypeError} `ERR_INVALID_ARG_TYPE` when `source` is not a
 *   `Uint8Array` (or subclass such as `Buffer`).
 * @throws {Error} With code `U_ILLEGAL_ARGUMENT_ERROR` when either encoding
 *   name normalizes to a falsy value, or when the native op reports
 *   "Unable to transcode Buffer" for the encoding pair.
 */
export function transcode(source, fromEnco, toEnco) {
  if (!isUint8Array(source)) {
    throw new codes.ERR_INVALID_ARG_TYPE(
      "source",
      ["Buffer", "Uint8Array"],
      source,
    );
  }
  if (source.length === 0) {
    return Buffer.alloc(0);
  }

  // Built on demand: constructing the error up front would allocate an error
  // object on every call, including the ones that succeed.
  const illegalArgumentError = () => {
    const code = "U_ILLEGAL_ARGUMENT_ERROR";
    return genericNodeError(
      `Unable to transcode Buffer [${code}]`,
      { code: code, errno: 1 },
    );
  };

  fromEnco = normalizeEncoding(fromEnco);
  toEnco = normalizeEncoding(toEnco);
  // NOTE(review): presumably normalizeEncoding yields a falsy value for
  // names it does not recognize — confirm against its implementation.
  if (!fromEnco || !toEnco) {
    throw illegalArgumentError();
  }

  // Return a copy of the provided source when transcoding is not required
  // for the from/to encoding pair.
  const returnSource = fromEnco === toEnco ||
    fromEnco === "ascii" && toEnco === "utf8" ||
    fromEnco === "ascii" && toEnco === "latin1";
  if (returnSource) {
    return Buffer.from(source);
  }

  try {
    const result = op_transcode(new Uint8Array(source), fromEnco, toEnco);
    return Buffer.from(result, toEnco);
  } catch (err) {
    // Guard the message lookup so that a thrown value without a string
    // `message` is re-thrown as-is instead of being masked by a TypeError.
    const unsupportedPair = err != null &&
      typeof err.message === "string" &&
      err.message.includes("Unable to transcode Buffer");
    if (unsupportedPair) {
      throw illegalArgumentError();
    }
    throw err;
  }
}
"test-net-better-error-messages-path.js", diff --git a/tests/node_compat/runner/TODO.md b/tests/node_compat/runner/TODO.md index 24e8271828..99258f5a56 100644 --- a/tests/node_compat/runner/TODO.md +++ b/tests/node_compat/runner/TODO.md @@ -1632,7 +1632,6 @@ NOTE: This file should not be manually edited. Please edit `tests/node_compat/co - [parallel/test-icu-minimum-version.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-minimum-version.js) - [parallel/test-icu-punycode.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-punycode.js) - [parallel/test-icu-stringwidth.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-stringwidth.js) -- [parallel/test-icu-transcode.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-transcode.js) - [parallel/test-inspect-address-in-use.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-address-in-use.js) - [parallel/test-inspect-async-hook-setup-at-inspect.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-async-hook-setup-at-inspect.js) - [parallel/test-inspect-publish-uid.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-publish-uid.js) diff --git a/tests/node_compat/test/parallel/test-icu-transcode.js b/tests/node_compat/test/parallel/test-icu-transcode.js new file mode 100644 index 0000000000..1f5aeb5355 --- /dev/null +++ b/tests/node_compat/test/parallel/test-icu-transcode.js @@ -0,0 +1,97 @@ +// deno-fmt-ignore-file +// deno-lint-ignore-file + +// Copyright Joyent and Node contributors. All rights reserved. MIT license. +// Taken from Node 18.12.1 +// This file is automatically generated by `tests/node_compat/runner/setup.ts`. Do not modify this file manually. 
+ +'use strict'; + +const common = require('../common'); + +if (!common.hasIntl) + common.skip('missing Intl'); + +const buffer = require('buffer'); +const assert = require('assert'); +const orig = Buffer.from('těst ☕', 'utf8'); + +// Test Transcoding +const tests = { + 'latin1': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f], + 'ascii': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f], + 'ucs2': [0x74, 0x00, 0x1b, 0x01, 0x73, + 0x00, 0x74, 0x00, 0x20, 0x00, + 0x15, 0x26] +}; + +for (const test in tests) { + const dest = buffer.transcode(orig, 'utf8', test); + assert.strictEqual(dest.length, tests[test].length, `utf8->${test} length`); + for (let n = 0; n < tests[test].length; n++) + assert.strictEqual(dest[n], tests[test][n], `utf8->${test} char ${n}`); +} + +{ + const dest = buffer.transcode(Buffer.from(tests.ucs2), 'ucs2', 'utf8'); + assert.strictEqual(dest.toString(), orig.toString()); +} + +{ + const utf8 = Buffer.from('€'.repeat(4000), 'utf8'); + const ucs2 = Buffer.from('€'.repeat(4000), 'ucs2'); + const utf8_to_ucs2 = buffer.transcode(utf8, 'utf8', 'ucs2'); + const ucs2_to_utf8 = buffer.transcode(ucs2, 'ucs2', 'utf8'); + assert.deepStrictEqual(utf8, ucs2_to_utf8); + assert.deepStrictEqual(ucs2, utf8_to_ucs2); + assert.strictEqual(ucs2_to_utf8.toString('utf8'), + utf8_to_ucs2.toString('ucs2')); +} + +assert.throws( + () => buffer.transcode(null, 'utf8', 'ascii'), + { + name: 'TypeError', + code: 'ERR_INVALID_ARG_TYPE', + message: 'The "source" argument must be an instance of Buffer ' + + 'or Uint8Array. 
Received null' + } +); + +assert.throws( + () => buffer.transcode(Buffer.from('a'), 'b', 'utf8'), + /^Error: Unable to transcode Buffer \[U_ILLEGAL_ARGUMENT_ERROR\]/ +); + +assert.throws( + () => buffer.transcode(Buffer.from('a'), 'uf8', 'b'), + /^Error: Unable to transcode Buffer \[U_ILLEGAL_ARGUMENT_ERROR\]$/ +); + +assert.deepStrictEqual( + buffer.transcode(Buffer.from('hi', 'ascii'), 'ascii', 'utf16le'), + Buffer.from('hi', 'utf16le')); +assert.deepStrictEqual( + buffer.transcode(Buffer.from('hi', 'latin1'), 'latin1', 'utf16le'), + Buffer.from('hi', 'utf16le')); +assert.deepStrictEqual( + buffer.transcode(Buffer.from('hä', 'latin1'), 'latin1', 'utf16le'), + Buffer.from('hä', 'utf16le')); + +// Test that Uint8Array arguments are okay. +{ + const uint8array = new Uint8Array([...Buffer.from('hä', 'latin1')]); + assert.deepStrictEqual( + buffer.transcode(uint8array, 'latin1', 'utf16le'), + Buffer.from('hä', 'utf16le')); +} + +{ + const dest = buffer.transcode(new Uint8Array(), 'utf8', 'latin1'); + assert.strictEqual(dest.length, 0); +} + +// Test that it doesn't crash +{ + buffer.transcode(new buffer.SlowBuffer(1), 'utf16le', 'ucs2'); +}