From ac7fadce6f5f6a8d6d31524e1f6b1ed0b85b1fc0 Mon Sep 17 00:00:00 2001
From: Luca Casonato <hello@lcas.dev>
Date: Mon, 28 Oct 2024 15:32:17 +0100
Subject: [PATCH] add method to ValueView

---
 src/string.rs | 77 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 75 insertions(+), 2 deletions(-)
diff --git a/src/string.rs b/src/string.rs
index 6ee1183e..4b60960f 100644
--- a/src/string.rs
+++ b/src/string.rs
@@ -778,7 +778,8 @@ impl String {
     }
   }
 
-  /// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
+  /// Converts a [`crate::String`] to either an owned [`std::string::String`],
+  /// or a borrowed [`str`], depending on whether it fits into the
   /// provided buffer.
   pub fn to_rust_cow_lossy<'a, const N: usize>(
     &self,
@@ -789,7 +790,9 @@ impl String {
     let string = unsafe { Local::from_raw(self).unwrap_unchecked() };
     let view = ValueView::new(scope, string);
     match view.data() {
-      ValueViewData::OneByte(bytes) => latin1_to_cow_str(bytes, buffer),
+      ValueViewData::OneByte(bytes) => {
+        latin1_to_cow_str_always_copy(bytes, buffer)
+      }
       ValueViewData::TwoByte(code_points) => {
         wtf16_to_cow_str(code_points, buffer)
       }
@@ -861,6 +864,44 @@ fn wtf16_to_string(code_points: &[u16]) -> std::string::String {
 
 #[inline(always)]
 fn latin1_to_cow_str<'a, const N: usize>(
+  bytes: &'a [u8],
+  buffer: &'a mut [MaybeUninit<u8>; N],
+) -> Cow<'a, str> {
+  if bytes.is_ascii() {
+    // SAFETY: The string is ASCII, so it's valid UTF-8.
+    Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(bytes) })
+  } else if bytes.len() * 2 < N {
+    // SAFETY: The string is Latin1 - we need to convert to UTF-8. But it
+    // is short enough to fit into the buffer, because the buffer is at
+    // least twice as large as the string and any non-ASCII one-byte
+    // character will be encoded as exactly two bytes in UTF-8.
+    let written = unsafe {
+      latin1_to_utf8(
+        bytes.len(),
+        bytes.as_ptr(),
+        buffer.as_mut_ptr() as *mut u8,
+      )
+    };
+    debug_assert!(written <= buffer.len());
+
+    // SAFETY: The buffer is filled with valid UTF-8 data.
+    let str = unsafe {
+      std::str::from_utf8_unchecked(std::slice::from_raw_parts(
+        buffer.as_ptr() as *const u8,
+        written,
+      ))
+    };
+    Cow::Borrowed(str)
+  } else {
+    // TODO: this could likely be optimized for large strings by using SIMD to
+    // calculate the length of the resulting string and then allocating once,
+    // and then converting the string using SIMD.
+    Cow::Owned(std::string::String::from_utf8_lossy(bytes).into_owned())
+  }
+}
+
+#[inline(always)]
+fn latin1_to_cow_str_always_copy<'a, const N: usize>(
   bytes: &[u8],
   buffer: &'a mut [MaybeUninit<u8>; N],
 ) -> Cow<'a, str> {
@@ -1145,3 +1186,35 @@ impl<'s> Drop for ValueView<'s> {
     unsafe { v8__String__ValueView__DESTRUCT(self) }
   }
 }
+
+impl ValueView<'_> {
+  /// Creates a copy of a [`ValueView`] in a [`std::string::String`].
+  /// Convenience function not present in the original V8 API.
+  pub fn to_rust_string_lossy(&self) -> std::string::String {
+    match self.data() {
+      ValueViewData::OneByte(bytes) => latin1_to_string(bytes),
+      ValueViewData::TwoByte(code_points) => wtf16_to_string(code_points),
+    }
+  }
+
+  /// Converts a [`ValueView`] to either an owned [`std::string::String`],
+  /// or a borrowed [`str`].
+  ///
+  /// If the [`ValueView`] is an ASCII one-byte string, a reference to the
+  /// string is returned and no copies are performed. If the string is not
+  /// ASCII, but fits into the provided buffer, it is copied into the buffer
+  /// and a reference to the buffer is returned. If the string does not fit
+  /// into the buffer, it is copied into a newly allocated
+  /// [`std::string::String`] and returned.
+  pub fn to_rust_cow_lossy<'a, const N: usize>(
+    &'a self,
+    buffer: &'a mut [MaybeUninit<u8>; N],
+  ) -> Cow<'a, str> {
+    match self.data() {
+      ValueViewData::OneByte(bytes) => latin1_to_cow_str(bytes, buffer),
+      ValueViewData::TwoByte(code_points) => {
+        wtf16_to_cow_str(code_points, buffer)
+      }
+    }
+  }
+}