From d706291c5db10884ce7a309eee41e07637cd0cf3 Mon Sep 17 00:00:00 2001
From: Matt Mastracci <matthew@mastracci.com>
Date: Fri, 30 Jun 2023 09:46:29 -0600
Subject: [PATCH] fix: Ensure that one-byte strings that are not ASCII go
 through write_utf8_uninit (#1261)

---
 src/string.rs     | 39 +++++++++++++++++++++------------------
 tests/test_api.rs | 15 +++++++++++++++
 2 files changed, 36 insertions(+), 18 deletions(-)
diff --git a/src/string.rs b/src/string.rs
index 8c6efbf0..772062eb 100644
--- a/src/string.rs
+++ b/src/string.rs
@@ -422,13 +422,16 @@ impl String {
 
   /// Creates a copy of a [`crate::String`] in a [`std::string::String`].
   /// Convenience function not present in the original V8 API.
-  #[inline(always)]
   pub fn to_rust_string_lossy(
     &self,
     scope: &mut Isolate,
   ) -> std::string::String {
-    if self.is_onebyte() {
-      let len_utf16 = self.length();
+    let len_utf8 = self.utf8_length(scope);
+    let len_utf16 = self.length();
+
+    // If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the
+    // string is 100% 7-bit ASCII.
+    if self.is_onebyte() && len_utf8 == len_utf16 {
       unsafe {
         // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
         // accidentally creating a slice of u8 which would be invalid.
@@ -452,15 +455,14 @@ impl String {
       }
     }
 
-    let capacity = self.utf8_length(scope);
     // SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
     // We have a large number of invariants to uphold, so please check changes to this code carefully
     unsafe {
       // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
       // accidentally creating a slice of u8 which would be invalid.
-      let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap();
+      let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
       let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
-      let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity);
+      let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);
 
       // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
       let length = self.write_utf8_uninit(
@@ -469,26 +471,28 @@ impl String {
         None,
         WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
       );
-      debug_assert!(length == capacity);
+      debug_assert!(length == len_utf8);
 
       // Return an owned string from this guaranteed now-initialized data
       let buffer = data as *mut u8;
-      std::string::String::from_raw_parts(buffer, length, capacity)
+      std::string::String::from_raw_parts(buffer, length, len_utf8)
     }
   }
 
   /// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
   /// provided buffer.
-  #[inline(always)]
   pub fn to_rust_cow_lossy<'a, const N: usize>(
     &self,
     scope: &mut Isolate,
     buffer: &'a mut [MaybeUninit<u8>; N],
   ) -> Cow<'a, str> {
     // TODO(mmastrac): Ideally we should be able to access the string's internal representation
-
+    let len_utf8 = self.utf8_length(scope);
     let len_utf16 = self.length();
-    if self.is_onebyte() {
+
+    // If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the
+    // string is 100% 7-bit ASCII.
+    if self.is_onebyte() && len_utf8 == len_utf16 {
       if len_utf16 <= N {
         let length = self.write_one_byte_uninit(
           scope,
@@ -532,8 +536,7 @@ impl String {
       }
     }
 
-    let capacity = self.utf8_length(scope);
-    if capacity <= N {
+    if len_utf8 <= N {
       // No malloc path
       let length = self.write_utf8_uninit(
         scope,
@@ -541,7 +544,7 @@ impl String {
         None,
         WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
       );
-      debug_assert!(length == capacity);
+      debug_assert!(length == len_utf8);
 
       // SAFETY: We know that we wrote `length` UTF-8 bytes. See `slice_assume_init_mut` for additional guarantee information.
       unsafe {
@@ -559,9 +562,9 @@ impl String {
     unsafe {
       // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
       // accidentally creating a slice of u8 which would be invalid.
-      let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap();
+      let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
       let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
-      let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity);
+      let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);
 
       // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
       let length = self.write_utf8_uninit(
@@ -570,12 +573,12 @@ impl String {
         None,
         WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
       );
-      debug_assert!(length == capacity);
+      debug_assert!(length == len_utf8);
 
       // Return an owned string from this guaranteed now-initialized data
       let buffer = data as *mut u8;
       Cow::Owned(std::string::String::from_raw_parts(
-        buffer, length, capacity,
+        buffer, length, len_utf8,
       ))
     }
   }
diff --git a/tests/test_api.rs b/tests/test_api.rs
index d6ed8511..d654b7fb 100644
--- a/tests/test_api.rs
+++ b/tests/test_api.rs
@@ -241,6 +241,21 @@ fn global_handle_drop() {
 fn test_string() {
   let _setup_guard = setup::parallel_test();
   let isolate = &mut v8::Isolate::new(Default::default());
+  {
+    // Ensure that a Latin-1 string correctly round-trips
+    let scope = &mut v8::HandleScope::new(isolate);
+    let reference = "\u{00a0}";
+    assert_eq!(2, reference.len());
+    let local = v8::String::new(scope, reference).unwrap();
+    assert_eq!(1, local.length());
+    assert_eq!(2, local.utf8_length(scope));
+    // Should round-trip to UTF-8
+    assert_eq!(2, local.to_rust_string_lossy(scope).len());
+    let mut buf = [MaybeUninit::uninit(); 0];
+    assert_eq!(2, local.to_rust_cow_lossy(scope, &mut buf).len());
+    let mut buf = [MaybeUninit::uninit(); 10];
+    assert_eq!(2, local.to_rust_cow_lossy(scope, &mut buf).len());
+  }
   {
     let scope = &mut v8::HandleScope::new(isolate);
     let reference = "Hello 🦕 world!";