denoland-deno/cli/text_encoding.rs

// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license.

use encoding_rs::*;
use std::{
  borrow::Cow,
  io::{Error, ErrorKind},
};

pub const BOM_CHAR: char = '\u{FEFF}';

/// Guesses the character encoding of `bytes` from a leading byte order mark.
///
/// Returns `"utf-16le"` when the input begins with the bytes `FF FE`,
/// `"utf-16be"` when it begins with `FE FF`, and `"utf-8"` in every other
/// case (including empty input). Only the first two bytes are inspected; the
/// rest of the content is never examined.
pub fn detect_charset(bytes: &'_ [u8]) -> &'static str {
  match bytes {
    // UTF-16 Little Endian byte order mark.
    [0xFF, 0xFE, ..] => "utf-16le",
    // UTF-16 Big Endian byte order mark.
    [0xFE, 0xFF, ..] => "utf-16be",
    // Anything without a UTF-16 BOM is treated as utf-8.
    _ => "utf-8",
  }
}

/// Decodes `bytes`, interpreted in the encoding named by `charset`, into a
/// UTF-8 string.
///
/// `charset` is looked up as an encoding label through the encoding_rs crate,
/// which implements the encodings of the WHATWG Encoding Standard and only
/// those (see: <https://encoding.spec.whatwg.org/>). Decoding is done without
/// BOM handling and without replacement characters.
///
/// # Errors
///
/// * `ErrorKind::InvalidInput` when `charset` is not a label known to
///   encoding_rs.
/// * `ErrorKind::InvalidData` when the decoder yields no result for `bytes`.
pub fn convert_to_utf8<'a>(
  bytes: &'a [u8],
  charset: &'_ str,
) -> Result<Cow<'a, str>, Error> {
  // Resolve the label first so an unknown charset is reported before any
  // decoding is attempted.
  let encoding = Encoding::for_label(charset.as_bytes()).ok_or_else(|| {
    Error::new(
      ErrorKind::InvalidInput,
      format!("Unsupported charset: {}", charset),
    )
  })?;

  encoding
    .decode_without_bom_handling_and_without_replacement(bytes)
    .ok_or_else(|| Error::from(ErrorKind::InvalidData))
}

/// Strips the byte order mark from the provided text if it exists.
pub fn strip_bom(text: &str) -> &str {
  if text.starts_with(BOM_CHAR) {
    &text[BOM_CHAR.len_utf8()..]
  } else {
    text
  }
}

#[cfg(test)]
mod tests {
  use super::*;

  /// Runs `detect_charset` on `input` and checks the result against
  /// `expected`, ignoring letter case.
  fn assert_charset(input: &[u8], expected: &str) {
    let detected = detect_charset(input);
    assert_eq!(expected.to_lowercase(), detected.to_lowercase());
  }

  #[test]
  fn test_detection_utf8_no_bom() {
    // Plain UTF-8 text containing a non-ASCII code point and no BOM.
    let text = "Hello UTF-8 it is \u{23F0} for Deno!";
    assert_charset(text.as_bytes(), "utf-8");
  }

  #[test]
  fn test_detection_utf16_little_endian() {
    assert_charset(b"\xFF\xFEHello UTF-16LE", "utf-16le");
  }

  #[test]
  fn test_detection_utf16_big_endian() {
    assert_charset(b"\xFE\xFFHello UTF-16BE", "utf-16be");
  }

  #[test]
  fn test_decoding_unsupported_charset() {
    // "utf-32le" is expected to be rejected as an unknown label.
    let err = convert_to_utf8(&[], "utf-32le").expect_err("Err expected");
    assert_eq!(err.kind(), ErrorKind::InvalidInput);
  }

  #[test]
  fn test_decoding_invalid_utf8() {
    // These bytes are expected to fail UTF-8 decoding.
    let err =
      convert_to_utf8(b"\xFE\xFE\xFF\xFF", "utf-8").expect_err("Err expected");
    assert_eq!(err.kind(), ErrorKind::InvalidData);
  }
}
chore: update copyright to 2022 (#13306) Co-authored-by: Erfan Safari <erfanshield@outlook.com> 2022-01-07 22:09:52 -05:00			`// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license.`
Move JSON ops to deno_core (#7336) 2020-09-05 20:34:02 -04:00
fix(cli): add support for non-UTF8 source files (#6789) Fixes: #5542 2020-08-03 17:39:48 -04:00			`use encoding_rs::*;`
			`use std::{`
			`borrow::Cow,`
			`io::{Error, ErrorKind},`
			`};`

fix: parse error when transpiling code with BOM (#11688) Co-authored-by: David Sherret <dsherret@gmail.com> 2021-08-16 03:28:29 -04:00			`pub const BOM_CHAR: char = '\u{FEFF}';`

fix(cli): add support for non-UTF8 source files (#6789) Fixes: #5542 2020-08-03 17:39:48 -04:00			`/// Attempts to detect the character encoding of the provided bytes.`
			`///`
			`/// Supports UTF-8, UTF-16 Little Endian and UTF-16 Big Endian.`
			`pub fn detect_charset(bytes: &'_ [u8]) -> &'static str {`
			`const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";`
			`const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";`

			`if bytes.starts_with(UTF16_LE_BOM) {`
			`"utf-16le"`
			`} else if bytes.starts_with(UTF16_BE_BOM) {`
			`"utf-16be"`
			`} else {`
			`// Assume everything else is utf-8`
			`"utf-8"`
			`}`
			`}`

			`/// Attempts to convert the provided bytes to a UTF-8 string.`
			`///`
			`/// Supports all encodings supported by the encoding_rs crate, which includes`
			`/// all encodings specified in the WHATWG Encoding Standard, and only those`
fix(doc): fix rustdoc bare_urls warning (#11921) 2021-09-05 10:22:45 -04:00			`/// encodings (see: <https://encoding.spec.whatwg.org/>).`
fix(cli): add support for non-UTF8 source files (#6789) Fixes: #5542 2020-08-03 17:39:48 -04:00			`pub fn convert_to_utf8<'a>(`
			`bytes: &'a [u8],`
			`charset: &'_ str,`
			`) -> Result<Cow<'a, str>, Error> {`
			`match Encoding::for_label(charset.as_bytes()) {`
			`Some(encoding) => encoding`
			`.decode_without_bom_handling_and_without_replacement(bytes)`
			`.ok_or_else(\|\| ErrorKind::InvalidData.into()),`
			`None => Err(Error::new(`
			`ErrorKind::InvalidInput,`
			`format!("Unsupported charset: {}", charset),`
			`)),`
			`}`
			`}`

fix: parse error when transpiling code with BOM (#11688) Co-authored-by: David Sherret <dsherret@gmail.com> 2021-08-16 03:28:29 -04:00			`/// Strips the byte order mark from the provided text if it exists.`
			`pub fn strip_bom(text: &str) -> &str {`
			`if text.starts_with(BOM_CHAR) {`
			`&text[BOM_CHAR.len_utf8()..]`
			`} else {`
			`text`
			`}`
			`}`

fix(cli): add support for non-UTF8 source files (#6789) Fixes: #5542 2020-08-03 17:39:48 -04:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`fn test_detection(test_data: &[u8], expected_charset: &str) {`
			`let detected_charset = detect_charset(test_data);`
			`assert_eq!(`
			`expected_charset.to_lowercase(),`
			`detected_charset.to_lowercase()`
			`);`
			`}`

			`#[test]`
			`fn test_detection_utf8_no_bom() {`
			`let test_data = "Hello UTF-8 it is \u{23F0} for Deno!"`
			`.to_owned()`
			`.into_bytes();`
			`test_detection(&test_data, "utf-8");`
			`}`

			`#[test]`
			`fn test_detection_utf16_little_endian() {`
			`let test_data = b"\xFF\xFEHello UTF-16LE".to_owned().to_vec();`
			`test_detection(&test_data, "utf-16le");`
			`}`

			`#[test]`
			`fn test_detection_utf16_big_endian() {`
			`let test_data = b"\xFE\xFFHello UTF-16BE".to_owned().to_vec();`
			`test_detection(&test_data, "utf-16be");`
			`}`

			`#[test]`
			`fn test_decoding_unsupported_charset() {`
			`let test_data = Vec::new();`
			`let result = convert_to_utf8(&test_data, "utf-32le");`
			`assert!(result.is_err());`
			`let err = result.expect_err("Err expected");`
			`assert!(err.kind() == ErrorKind::InvalidInput);`
			`}`

			`#[test]`
			`fn test_decoding_invalid_utf8() {`
			`let test_data = b"\xFE\xFE\xFF\xFF".to_vec();`
			`let result = convert_to_utf8(&test_data, "utf-8");`
			`assert!(result.is_err());`
			`let err = result.expect_err("Err expected");`
			`assert!(err.kind() == ErrorKind::InvalidData);`
			`}`
			`}`