From d706291c5db10884ce7a309eee41e07637cd0cf3 Mon Sep 17 00:00:00 2001 From: Matt Mastracci Date: Fri, 30 Jun 2023 09:46:29 -0600 Subject: [PATCH] fix: Ensure that one-byte strings that are not ASCII go through write_utf8_uninit (#1261) --- src/string.rs | 39 +++++++++++++++++++++------------------ tests/test_api.rs | 15 +++++++++++++++ 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/string.rs b/src/string.rs index 8c6efbf0..772062eb 100644 --- a/src/string.rs +++ b/src/string.rs @@ -422,13 +422,16 @@ impl String { /// Creates a copy of a [`crate::String`] in a [`std::string::String`]. /// Convenience function not present in the original V8 API. - #[inline(always)] pub fn to_rust_string_lossy( &self, scope: &mut Isolate, ) -> std::string::String { - if self.is_onebyte() { - let len_utf16 = self.length(); + let len_utf8 = self.utf8_length(scope); + let len_utf16 = self.length(); + + // If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the + // string is 100% 7-bit ASCII. + if self.is_onebyte() && len_utf8 == len_utf16 { unsafe { // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid // accidentally creating a slice of u8 which would be invalid. @@ -452,15 +455,14 @@ impl String { } } - let capacity = self.utf8_length(scope); // SAFETY: This allocates a buffer manually using the default allocator using the string's capacity. // We have a large number of invariants to uphold, so please check changes to this code carefully unsafe { // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid // accidentally creating a slice of u8 which would be invalid. - let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap(); + let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap(); let data = std::alloc::alloc(layout) as *mut MaybeUninit; - let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity); + let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8); // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer let length = self.write_utf8_uninit( @@ -469,26 +471,28 @@ impl String { None, WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8, ); - debug_assert!(length == capacity); + debug_assert!(length == len_utf8); // Return an owned string from this guaranteed now-initialized data let buffer = data as *mut u8; - std::string::String::from_raw_parts(buffer, length, capacity) + std::string::String::from_raw_parts(buffer, length, len_utf8) } } /// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the /// provided buffer. - #[inline(always)] pub fn to_rust_cow_lossy<'a, const N: usize>( &self, scope: &mut Isolate, buffer: &'a mut [MaybeUninit; N], ) -> Cow<'a, str> { // TODO(mmastrac): Ideally we should be able to access the string's internal representation - + let len_utf8 = self.utf8_length(scope); let len_utf16 = self.length(); - if self.is_onebyte() { + + // If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the + // string is 100% 7-bit ASCII. + if self.is_onebyte() && len_utf8 == len_utf16 { if len_utf16 <= N { let length = self.write_one_byte_uninit( scope, @@ -532,8 +536,7 @@ impl String { } } - let capacity = self.utf8_length(scope); - if capacity <= N { + if len_utf8 <= N { // No malloc path let length = self.write_utf8_uninit( scope, @@ -541,7 +544,7 @@ impl String { None, WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8, ); - debug_assert!(length == capacity); + debug_assert!(length == len_utf8); // SAFETY: We know that we wrote `length` UTF-8 bytes. See `slice_assume_init_mut` for additional guarantee information. unsafe { @@ -559,9 +562,9 @@ impl String { unsafe { // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid // accidentally creating a slice of u8 which would be invalid. - let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap(); + let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap(); let data = std::alloc::alloc(layout) as *mut MaybeUninit; - let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity); + let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8); // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer let length = self.write_utf8_uninit( @@ -570,12 +573,12 @@ impl String { None, WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8, ); - debug_assert!(length == capacity); + debug_assert!(length == len_utf8); // Return an owned string from this guaranteed now-initialized data let buffer = data as *mut u8; Cow::Owned(std::string::String::from_raw_parts( - buffer, length, capacity, + buffer, length, len_utf8, )) } } diff --git a/tests/test_api.rs b/tests/test_api.rs index d6ed8511..d654b7fb 100644 --- a/tests/test_api.rs +++ b/tests/test_api.rs @@ -241,6 +241,21 @@ fn global_handle_drop() { fn test_string() { let _setup_guard = setup::parallel_test(); let isolate = &mut v8::Isolate::new(Default::default()); + { + // Ensure that a Latin-1 string correctly round-trips + let scope = &mut v8::HandleScope::new(isolate); + let reference = "\u{00a0}"; + assert_eq!(2, reference.len()); + let local = v8::String::new(scope, reference).unwrap(); + assert_eq!(1, local.length()); + assert_eq!(2, local.utf8_length(scope)); + // Should round-trip to UTF-8 + assert_eq!(2, local.to_rust_string_lossy(scope).len()); + let mut buf = [MaybeUninit::uninit(); 0]; + assert_eq!(2, local.to_rust_cow_lossy(scope, &mut buf).len()); + let mut buf = [MaybeUninit::uninit(); 10]; + assert_eq!(2, local.to_rust_cow_lossy(scope, &mut buf).len()); + } { let scope = &mut v8::HandleScope::new(isolate); let reference = "Hello 🦕 world!";