From ad0a65d0a571c5d9c783ac28d94fbbc2c7d59000 Mon Sep 17 00:00:00 2001
From: Matt Mastracci
Date: Wed, 28 Jun 2023 07:46:50 -0600
Subject: [PATCH] feat: Use MaybeUninit for to_rust_string_lossy and add to_rust_cow_lossy (#1256)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zero-length strings return early so that the allocator is never asked for a
zero-sized layout. The one-byte fast path is only taken when the string is
pure ASCII (UTF-8 length == UTF-16 length); other one-byte (Latin-1) strings
go through the UTF-8 writer. Allocation failures are reported through
`std::alloc::handle_alloc_error`.

Co-authored-by: Bartek Iwańczuk
---
 src/string.rs     | 195 +++++++++++++++++++++++++++++++++++++++++++++---
 tests/test_api.rs |  66 ++++++++++++++++
 2 files changed, 250 insertions(+), 11 deletions(-)

diff --git a/src/string.rs b/src/string.rs
index 55a5d92b..8c6efbf0 100644
--- a/src/string.rs
+++ b/src/string.rs
@@ -1,6 +1,6 @@
+use std::borrow::Cow;
 use std::convert::TryInto;
 use std::default::Default;
-use std::mem::forget;
 use std::mem::MaybeUninit;
 use std::slice;
 
@@ -420,22 +420,195 @@ impl String {
     unsafe { v8__String__ContainsOnlyOneByte(self) }
   }
 
+  /// Creates a copy of a [`crate::String`] in a [`std::string::String`].
   /// Convenience function not present in the original V8 API.
   #[inline(always)]
   pub fn to_rust_string_lossy(
     &self,
     scope: &mut Isolate,
   ) -> std::string::String {
+    let len_utf16 = self.length();
+
+    // Zero-length strings need no work, and `std::alloc::alloc` must never be called with a
+    // zero-sized layout (doing so is undefined behavior).
+    if len_utf16 == 0 {
+      return std::string::String::new();
+    }
+
     let capacity = self.utf8_length(scope);
-    let mut string = std::string::String::with_capacity(capacity);
-    let data = string.as_mut_ptr();
-    forget(string);
-    let length = self.write_utf8(
-      scope,
-      unsafe { slice::from_raw_parts_mut(data, capacity) },
-      None,
-      WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
-    );
-    unsafe { std::string::String::from_raw_parts(data, length, capacity) }
+
+    // A one-byte (Latin-1) string is only valid UTF-8 as-is when it is pure 7-bit ASCII, i.e. when
+    // its UTF-8 length equals its UTF-16 length. Only then can the raw bytes be copied directly.
+    if self.is_onebyte() && capacity == len_utf16 {
+      unsafe {
+        // Create an uninitialized buffer of `len_utf16` bytes. We need to be careful here to avoid
+        // accidentally creating a slice of u8 which would be invalid.
+        let layout = std::alloc::Layout::from_size_align(len_utf16, 1).unwrap();
+        let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
+        if data.is_null() {
+          std::alloc::handle_alloc_error(layout);
+        }
+        let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf16);
+
+        // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
+        let length = self.write_one_byte_uninit(
+          scope,
+          &mut *buffer,
+          0,
+          WriteOptions::NO_NULL_TERMINATION
+            | WriteOptions::REPLACE_INVALID_UTF8,
+        );
+        debug_assert!(length == len_utf16);
+
+        // Return an owned string from this guaranteed now-initialized data
+        let buffer = data as *mut u8;
+        return std::string::String::from_raw_parts(buffer, length, len_utf16);
+      }
+    }
+
+    // SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
+    // We have a large number of invariants to uphold, so please check changes to this code carefully
+    unsafe {
+      // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
+      // accidentally creating a slice of u8 which would be invalid.
+      let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap();
+      let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
+      if data.is_null() {
+        std::alloc::handle_alloc_error(layout);
+      }
+      let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity);
+
+      // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
+      let length = self.write_utf8_uninit(
+        scope,
+        &mut *buffer,
+        None,
+        WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
+      );
+      debug_assert!(length == capacity);
+
+      // Return an owned string from this guaranteed now-initialized data
+      let buffer = data as *mut u8;
+      std::string::String::from_raw_parts(buffer, length, capacity)
+    }
+  }
+
+  /// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
+  /// provided buffer.
+  #[inline(always)]
+  pub fn to_rust_cow_lossy<'a, const N: usize>(
+    &self,
+    scope: &mut Isolate,
+    buffer: &'a mut [MaybeUninit<u8>; N],
+  ) -> Cow<'a, str> {
+    // TODO(mmastrac): Ideally we should be able to access the string's internal representation
+
+    let len_utf16 = self.length();
+
+    // Zero-length strings need no work, and `std::alloc::alloc` must never be called with a
+    // zero-sized layout (doing so is undefined behavior).
+    if len_utf16 == 0 {
+      return Cow::Borrowed("");
+    }
+
+    let capacity = self.utf8_length(scope);
+
+    // A one-byte (Latin-1) string is only valid UTF-8 as-is when it is pure 7-bit ASCII, i.e. when
+    // its UTF-8 length equals its UTF-16 length. Only then can the raw bytes be copied directly.
+    if self.is_onebyte() && capacity == len_utf16 {
+      if len_utf16 <= N {
+        let length = self.write_one_byte_uninit(
+          scope,
+          buffer,
+          0,
+          WriteOptions::NO_NULL_TERMINATION,
+        );
+        debug_assert!(length == len_utf16);
+        unsafe {
+          // Get a slice of &[u8] of what we know is initialized now
+          let buffer = &mut buffer[..length];
+          let buffer = &mut *(buffer as *mut [_] as *mut [u8]);
+
+          // We know it's valid UTF-8, so make a string
+          return Cow::Borrowed(std::str::from_utf8_unchecked(buffer));
+        }
+      }
+
+      unsafe {
+        // Create an uninitialized buffer of `len_utf16` bytes. We need to be careful here to avoid
+        // accidentally creating a slice of u8 which would be invalid.
+        let layout = std::alloc::Layout::from_size_align(len_utf16, 1).unwrap();
+        let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
+        if data.is_null() {
+          std::alloc::handle_alloc_error(layout);
+        }
+        let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf16);
+
+        // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
+        let length = self.write_one_byte_uninit(
+          scope,
+          &mut *buffer,
+          0,
+          WriteOptions::NO_NULL_TERMINATION
+            | WriteOptions::REPLACE_INVALID_UTF8,
+        );
+        debug_assert!(length == len_utf16);
+
+        // Return an owned string from this guaranteed now-initialized data
+        let buffer = data as *mut u8;
+        return Cow::Owned(std::string::String::from_raw_parts(
+          buffer, length, len_utf16,
+        ));
+      }
+    }
+
+    if capacity <= N {
+      // No malloc path
+      let length = self.write_utf8_uninit(
+        scope,
+        buffer,
+        None,
+        WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
+      );
+      debug_assert!(length == capacity);
+
+      // SAFETY: We know that we wrote `length` UTF-8 bytes. See `slice_assume_init_mut` for additional guarantee information.
+      unsafe {
+        // Get a slice of &[u8] of what we know is initialized now
+        let buffer = &mut buffer[..length];
+        let buffer = &mut *(buffer as *mut [_] as *mut [u8]);
+
+        // We know it's valid UTF-8, so make a string
+        return Cow::Borrowed(std::str::from_utf8_unchecked(buffer));
+      }
+    }
+
+    // SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
+    // We have a large number of invariants to uphold, so please check changes to this code carefully
+    unsafe {
+      // Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
+      // accidentally creating a slice of u8 which would be invalid.
+      let layout = std::alloc::Layout::from_size_align(capacity, 1).unwrap();
+      let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
+      if data.is_null() {
+        std::alloc::handle_alloc_error(layout);
+      }
+      let buffer = std::ptr::slice_from_raw_parts_mut(data, capacity);
+
+      // Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
+      let length = self.write_utf8_uninit(
+        scope,
+        &mut *buffer,
+        None,
+        WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
+      );
+      debug_assert!(length == capacity);
+
+      // Return an owned string from this guaranteed now-initialized data
+      let buffer = data as *mut u8;
+      Cow::Owned(std::string::String::from_raw_parts(
+        buffer, length, capacity,
+      ))
+    }
   }
 }
diff --git a/tests/test_api.rs b/tests/test_api.rs
index 2ebb3403..d6ed8511 100644
--- a/tests/test_api.rs
+++ b/tests/test_api.rs
@@ -1,6 +1,7 @@
 // Copyright 2019-2021 the Deno authors. All rights reserved. MIT license.
 use once_cell::sync::Lazy;
 use std::any::type_name;
+use std::borrow::Cow;
 use std::cell::RefCell;
 use std::collections::hash_map::DefaultHasher;
 use std::collections::HashMap;
@@ -410,6 +411,71 @@ fn test_string() {
     let invalid_4_octet_sequence = valid_6_octet_sequence.unwrap();
     assert_eq!(invalid_4_octet_sequence.length(), 6);
   }
+  {
+    let scope = &mut v8::HandleScope::new(isolate);
+    let s = "Lorem ipsum dolor sit amet. Qui inventore debitis et voluptas cupiditate qui recusandae molestias et ullam possimus";
+    let one_byte = v8::String::new_from_one_byte(
+      scope,
+      s.as_bytes(),
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+
+    // Does not fit
+    let mut buffer = [MaybeUninit::uninit(); 10];
+    let cow = one_byte.to_rust_cow_lossy(scope, &mut buffer);
+    assert!(matches!(cow, Cow::Owned(_)));
+    assert_eq!(s, cow);
+
+    // Fits
+    let mut buffer = [MaybeUninit::uninit(); 1000];
+    let cow = one_byte.to_rust_cow_lossy(scope, &mut buffer);
+    assert!(matches!(cow, Cow::Borrowed(_)));
+    assert_eq!(s, cow);
+
+    let s = "🦕 Lorem ipsum dolor sit amet. Qui inventore debitis et voluptas cupiditate qui recusandae molestias et ullam possimus";
+    let two_bytes =
+      v8::String::new_from_utf8(scope, s.as_bytes(), v8::NewStringType::Normal)
+        .unwrap();
+
+    // Does not fit
+    let mut buffer = [MaybeUninit::uninit(); 10];
+    let cow = two_bytes.to_rust_cow_lossy(scope, &mut buffer);
+    assert!(matches!(cow, Cow::Owned(_)));
+    assert_eq!(s, cow);
+
+    // Fits
+    let mut buffer = [MaybeUninit::uninit(); 1000];
+    let cow = two_bytes.to_rust_cow_lossy(scope, &mut buffer);
+    assert!(matches!(cow, Cow::Borrowed(_)));
+    assert_eq!(s, cow);
+
+    // One-byte strings that are not pure ASCII must be transcoded to UTF-8
+    let latin1 = v8::String::new_from_one_byte(
+      scope,
+      &[0xE9],
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+    assert_eq!("é", latin1.to_rust_string_lossy(scope));
+    let mut buffer = [MaybeUninit::uninit(); 10];
+    let cow = latin1.to_rust_cow_lossy(scope, &mut buffer);
+    assert!(matches!(cow, Cow::Borrowed(_)));
+    assert_eq!("é", cow);
+
+    // Empty strings take the early-return path and never allocate
+    let empty = v8::String::new_from_one_byte(
+      scope,
+      &[],
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+    assert_eq!("", empty.to_rust_string_lossy(scope));
+    let mut buffer = [MaybeUninit::uninit(); 10];
+    let cow = empty.to_rust_cow_lossy(scope, &mut buffer);
+    assert!(matches!(cow, Cow::Borrowed(_)));
+    assert_eq!("", cow);
+  }
 }
 
 #[test]