denoland-deno/ext/node/ops/buffer.rs

// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

use deno_core::anyhow::anyhow;
use deno_core::anyhow::Result;
use deno_core::op2;

#[op2(fast)]
pub fn op_is_ascii(#[buffer] buf: &[u8]) -> bool {
  buf.is_ascii()
}

#[op2(fast)]
pub fn op_is_utf8(#[buffer] buf: &[u8]) -> bool {
  std::str::from_utf8(buf).is_ok()
}

#[op2]
#[buffer]
pub fn op_transcode(
  #[buffer] source: &[u8],
  #[string] from_encoding: &str,
  #[string] to_encoding: &str,
) -> Result<Vec<u8>> {
  match (from_encoding, to_encoding) {
    ("utf8", "ascii") => Ok(utf8_to_ascii(source)),
    ("utf8", "latin1") => Ok(utf8_to_latin1(source)),
    ("utf8", "utf16le") => utf8_to_utf16le(source),
    ("utf16le", "utf8") => utf16le_to_utf8(source),
    ("latin1", "utf16le") | ("ascii", "utf16le") => {
      Ok(latin1_ascii_to_utf16le(source))
    }
    (from, to) => Err(anyhow!("Unable to transcode Buffer {from}->{to}")),
  }
}

fn latin1_ascii_to_utf16le(source: &[u8]) -> Vec<u8> {
  let mut result = Vec::with_capacity(source.len() * 2);
  for &byte in source {
    result.push(byte);
    result.push(0);
  }
  result
}

fn utf16le_to_utf8(source: &[u8]) -> Result<Vec<u8>> {
  let ucs2_vec: Vec<u16> = source
    .chunks(2)
    .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
    .collect();
  String::from_utf16(&ucs2_vec)
    .map(|utf8_string| utf8_string.into_bytes())
    .map_err(|e| anyhow!("Invalid UTF-16 sequence: {}", e))
}

fn utf8_to_utf16le(source: &[u8]) -> Result<Vec<u8>> {
  let utf8_string = std::str::from_utf8(source)?;
  let ucs2_vec: Vec<u16> = utf8_string.encode_utf16().collect();
  let bytes: Vec<u8> = ucs2_vec.iter().flat_map(|&x| x.to_le_bytes()).collect();
  Ok(bytes)
}

fn utf8_to_latin1(source: &[u8]) -> Vec<u8> {
  let mut latin1_bytes = Vec::with_capacity(source.len());
  let mut i = 0;
  while i < source.len() {
    match source[i] {
      byte if byte <= 0x7F => {
        // ASCII character
        latin1_bytes.push(byte);
        i += 1;
      }
      byte if (0xC2..=0xDF).contains(&byte) && i + 1 < source.len() => {
        // 2-byte UTF-8 sequence
        let codepoint =
          ((byte as u16 & 0x1F) << 6) | (source[i + 1] as u16 & 0x3F);
        latin1_bytes.push(if codepoint <= 0xFF {
          codepoint as u8
        } else {
          b'?'
        });
        i += 2;
      }
      _ => {
        // 3-byte or 4-byte UTF-8 sequence, or invalid UTF-8
        latin1_bytes.push(b'?');
        // Skip to the next valid UTF-8 start byte
        i += 1;
        while i < source.len() && (source[i] & 0xC0) == 0x80 {
          i += 1;
        }
      }
    }
  }
  latin1_bytes
}

fn utf8_to_ascii(source: &[u8]) -> Vec<u8> {
  let mut ascii_bytes = Vec::with_capacity(source.len());
  let mut i = 0;
  while i < source.len() {
    match source[i] {
      byte if byte <= 0x7F => {
        // ASCII character
        ascii_bytes.push(byte);
        i += 1;
      }
      _ => {
        // Non-ASCII character
        ascii_bytes.push(b'?');
        // Skip to the next valid UTF-8 start byte
        i += 1;
        while i < source.len() && (source[i] & 0xC0) == 0x80 {
          i += 1;
        }
      }
    }
  }
  ascii_bytes
}
feat(node): buffer isUtf8/isAscii (#23928) Fixes: https://github.com/denoland/deno/issues/23657 Implements `isUtf8` and `isAscii` as ops. 2024-05-21 18:50:59 -04:00			`// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.`

feat(ext/node): buffer.transcode() (#25972) Closes https://github.com/denoland/deno/issues/25911 2024-10-02 04:23:14 -04:00			`use deno_core::anyhow::anyhow;`
			`use deno_core::anyhow::Result;`
feat(node): buffer isUtf8/isAscii (#23928) Fixes: https://github.com/denoland/deno/issues/23657 Implements `isUtf8` and `isAscii` as ops. 2024-05-21 18:50:59 -04:00			`use deno_core::op2;`

			`#[op2(fast)]`
			`pub fn op_is_ascii(#[buffer] buf: &[u8]) -> bool {`
			`buf.is_ascii()`
			`}`

			`#[op2(fast)]`
			`pub fn op_is_utf8(#[buffer] buf: &[u8]) -> bool {`
			`std::str::from_utf8(buf).is_ok()`
			`}`
feat(ext/node): buffer.transcode() (#25972) Closes https://github.com/denoland/deno/issues/25911 2024-10-02 04:23:14 -04:00
			`#[op2]`
			`#[buffer]`
			`pub fn op_transcode(`
			`#[buffer] source: &[u8],`
			`#[string] from_encoding: &str,`
			`#[string] to_encoding: &str,`
			`) -> Result<Vec<u8>> {`
			`match (from_encoding, to_encoding) {`
			`("utf8", "ascii") => Ok(utf8_to_ascii(source)),`
			`("utf8", "latin1") => Ok(utf8_to_latin1(source)),`
			`("utf8", "utf16le") => utf8_to_utf16le(source),`
			`("utf16le", "utf8") => utf16le_to_utf8(source),`
			`("latin1", "utf16le") \| ("ascii", "utf16le") => {`
			`Ok(latin1_ascii_to_utf16le(source))`
			`}`
			`(from, to) => Err(anyhow!("Unable to transcode Buffer {from}->{to}")),`
			`}`
			`}`

			`fn latin1_ascii_to_utf16le(source: &[u8]) -> Vec<u8> {`
			`let mut result = Vec::with_capacity(source.len() * 2);`
			`for &byte in source {`
			`result.push(byte);`
			`result.push(0);`
			`}`
			`result`
			`}`

			`fn utf16le_to_utf8(source: &[u8]) -> Result<Vec<u8>> {`
			`let ucs2_vec: Vec<u16> = source`
			`.chunks(2)`
			`.map(\|chunk\| u16::from_le_bytes([chunk[0], chunk[1]]))`
			`.collect();`
			`String::from_utf16(&ucs2_vec)`
			`.map(\|utf8_string\| utf8_string.into_bytes())`
			`.map_err(\|e\| anyhow!("Invalid UTF-16 sequence: {}", e))`
			`}`

			`fn utf8_to_utf16le(source: &[u8]) -> Result<Vec<u8>> {`
			`let utf8_string = std::str::from_utf8(source)?;`
			`let ucs2_vec: Vec<u16> = utf8_string.encode_utf16().collect();`
			`let bytes: Vec<u8> = ucs2_vec.iter().flat_map(\|&x\| x.to_le_bytes()).collect();`
			`Ok(bytes)`
			`}`

			`fn utf8_to_latin1(source: &[u8]) -> Vec<u8> {`
			`let mut latin1_bytes = Vec::with_capacity(source.len());`
			`let mut i = 0;`
			`while i < source.len() {`
			`match source[i] {`
			`byte if byte <= 0x7F => {`
			`// ASCII character`
			`latin1_bytes.push(byte);`
			`i += 1;`
			`}`
			`byte if (0xC2..=0xDF).contains(&byte) && i + 1 < source.len() => {`
			`// 2-byte UTF-8 sequence`
			`let codepoint =`
			`((byte as u16 & 0x1F) << 6) \| (source[i + 1] as u16 & 0x3F);`
			`latin1_bytes.push(if codepoint <= 0xFF {`
			`codepoint as u8`
			`} else {`
			`b'?'`
			`});`
			`i += 2;`
			`}`
			`_ => {`
			`// 3-byte or 4-byte UTF-8 sequence, or invalid UTF-8`
			`latin1_bytes.push(b'?');`
			`// Skip to the next valid UTF-8 start byte`
			`i += 1;`
			`while i < source.len() && (source[i] & 0xC0) == 0x80 {`
			`i += 1;`
			`}`
			`}`
			`}`
			`}`
			`latin1_bytes`
			`}`

			`fn utf8_to_ascii(source: &[u8]) -> Vec<u8> {`
			`let mut ascii_bytes = Vec::with_capacity(source.len());`
			`let mut i = 0;`
			`while i < source.len() {`
			`match source[i] {`
			`byte if byte <= 0x7F => {`
			`// ASCII character`
			`ascii_bytes.push(byte);`
			`i += 1;`
			`}`
			`_ => {`
			`// Non-ASCII character`
			`ascii_bytes.push(b'?');`
			`// Skip to the next valid UTF-8 start byte`
			`i += 1;`
			`while i < source.len() && (source[i] & 0xC0) == 0x80 {`
			`i += 1;`
			`}`
			`}`
			`}`
			`}`
			`ascii_bytes`
			`}`