fix: Ensure that one-byte strings that are not ASCII go through write_utf8_uninit (#1261)

2025-01-13 01:22:42 -05:00 · 2023-06-30 09:46:29 -06:00 · 2023-06-30 09:46:29 -06:00 · d706291c5d
commit d706291c5d
parent c4033caf23
2 changed files with 36 additions and 18 deletions
--- a/src/string.rs
+++ b/src/string.rs
@ -422,13 +422,16 @@ impl String {

  /// Creates a copy of a [`crate::String`] in a [`std::string::String`].
  /// Convenience function not present in the original V8 API.
-  #[inline(always)]
  pub fn to_rust_string_lossy(
    &self,
    scope: &mut Isolate,
  ) -> std::string::String {
-    if self.is_onebyte() {
-      let len_utf16 = self.length();
+    let len_utf8 = self.utf8_length(scope);
+    let len_utf16 = self.length();
+
+    // Fast memcpy path: only valid when the string is one-byte and len_utf8 == len_utf16. That equality holds iff
+    // every character is 7-bit ASCII; a Latin-1 character >= 0x80 needs two UTF-8 bytes and must be transcoded.
+    if self.is_onebyte() && len_utf8 == len_utf16 {
      unsafe {
        // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
        // accidentally creating a slice of u8 which would be invalid.
@ -452,15 +455,14 @@ impl String {
      }
    }

-    let capacity = self.utf8_length(scope);
    // SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
    // We have a large number of invariants to uphold, so please check changes to this code carefully
    unsafe {
      // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
      // accidentally creating a slice of u8 which would be invalid.
-      let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap();
+      let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
      let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
-      let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity);
+      let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);

      // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
      let length = self.write_utf8_uninit(
@ -469,26 +471,28 @@ impl String {
        None,
        WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
      );
-      debug_assert!(length == capacity);
+      debug_assert!(length == len_utf8);

      // Return an owned string from this guaranteed now-initialized data
      let buffer = data as *mut u8;
-      std::string::String::from_raw_parts(buffer, length, capacity)
+      std::string::String::from_raw_parts(buffer, length, len_utf8)
    }
  }

  /// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
  /// provided buffer.
-  #[inline(always)]
  pub fn to_rust_cow_lossy<'a, const N: usize>(
    &self,
    scope: &mut Isolate,
    buffer: &'a mut [MaybeUninit<u8>; N],
  ) -> Cow<'a, str> {
    // TODO(mmastrac): Ideally we should be able to access the string's internal representation
-
+    let len_utf8 = self.utf8_length(scope);
    let len_utf16 = self.length();
-    if self.is_onebyte() {
+
+    // Fast memcpy path: only valid when the string is one-byte and len_utf8 == len_utf16. That equality holds iff
+    // every character is 7-bit ASCII; a Latin-1 character >= 0x80 needs two UTF-8 bytes and must be transcoded.
+    if self.is_onebyte() && len_utf8 == len_utf16 {
      if len_utf16 <= N {
        let length = self.write_one_byte_uninit(
          scope,
@ -532,8 +536,7 @@ impl String {
      }
    }

-    let capacity = self.utf8_length(scope);
-    if capacity <= N {
+    if len_utf8 <= N {
      // No malloc path
      let length = self.write_utf8_uninit(
        scope,
@ -541,7 +544,7 @@ impl String {
        None,
        WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
      );
-      debug_assert!(length == capacity);
+      debug_assert!(length == len_utf8);

      // SAFETY: We know that we wrote `length` UTF-8 bytes. See `slice_assume_init_mut` for additional guarantee information.
      unsafe {
@ -559,9 +562,9 @@ impl String {
    unsafe {
      // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
      // accidentally creating a slice of u8 which would be invalid.
-      let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap();
+      let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
      let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
-      let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity);
+      let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);

      // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
      let length = self.write_utf8_uninit(
@ -570,12 +573,12 @@ impl String {
        None,
        WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
      );
-      debug_assert!(length == capacity);
+      debug_assert!(length == len_utf8);

      // Return an owned string from this guaranteed now-initialized data
      let buffer = data as *mut u8;
      Cow::Owned(std::string::String::from_raw_parts(
-        buffer, length, capacity,
+        buffer, length, len_utf8,
      ))
    }
  }
--- a/tests/test_api.rs
+++ b/tests/test_api.rs
@ -241,6 +241,21 @@ fn global_handle_drop() {
 fn test_string() {
  let _setup_guard = setup::parallel_test();
  let isolate = &mut v8::Isolate::new(Default::default());
+  {
+    // Ensure that a Latin-1 string correctly round-trips
+    let scope = &mut v8::HandleScope::new(isolate);
+    let reference = "\u{00a0}";
+    assert_eq!(2, reference.len());
+    let local = v8::String::new(scope, reference).unwrap();
+    assert_eq!(1, local.length());
+    assert_eq!(2, local.utf8_length(scope));
+    // Should round-trip to UTF-8
+    assert_eq!(2, local.to_rust_string_lossy(scope).len());
+    let mut buf = [MaybeUninit::uninit(); 0];
+    assert_eq!(2, local.to_rust_cow_lossy(scope, &mut buf).len());
+    let mut buf = [MaybeUninit::uninit(); 10];
+    assert_eq!(2, local.to_rust_cow_lossy(scope, &mut buf).len());
+  }
  {
    let scope = &mut v8::HandleScope::new(isolate);
    let reference = "Hello 🦕 world!";