mirror of
https://github.com/denoland/rusty_v8.git
synced 2024-11-24 15:19:31 -05:00
perf: speed up v8::String::to_rust_*_lossy()
This commit speeds up this common conversion method between by 2x for many common cases. Short one byte ASCII strings are now 20% faster. Longer one byte ASCII strings are 2.5x faster. Short UTF8 strings are marginally slower (5%) but longer UTF8 strings are upwards of 2x faster. A follow up will make the short UTF8 strings about 2x faster than the current implementation as well.
This commit is contained in:
parent
e67f11bf79
commit
6fcb6a9a0c
3 changed files with 317 additions and 156 deletions
11
Cargo.lock
generated
11
Cargo.lock
generated
|
@ -1260,6 +1260,16 @@ version = "1.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "simdutf"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c1945a45633804474a6f1aef87f072d7564c6421025a865f6777709a571fdfae"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags 2.5.0",
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "slotmap"
|
name = "slotmap"
|
||||||
version = "1.0.7"
|
version = "1.0.7"
|
||||||
|
@ -1456,6 +1466,7 @@ dependencies = [
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"paste",
|
"paste",
|
||||||
"rustversion",
|
"rustversion",
|
||||||
|
"simdutf",
|
||||||
"trybuild",
|
"trybuild",
|
||||||
"which",
|
"which",
|
||||||
]
|
]
|
||||||
|
|
|
@ -91,6 +91,7 @@ use_custom_libcxx = []
|
||||||
bitflags = "2.5"
|
bitflags = "2.5"
|
||||||
once_cell = "1.19"
|
once_cell = "1.19"
|
||||||
paste = "1.0"
|
paste = "1.0"
|
||||||
|
simdutf = "0.5.1"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
miniz_oxide = "0.7.2"
|
miniz_oxide = "0.7.2"
|
||||||
|
|
461
src/string.rs
461
src/string.rs
|
@ -11,6 +11,7 @@ use std::borrow::Cow;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::default::Default;
|
use std::default::Default;
|
||||||
use std::ffi::c_void;
|
use std::ffi::c_void;
|
||||||
|
use std::hint::unreachable_unchecked;
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
use std::mem::MaybeUninit;
|
use std::mem::MaybeUninit;
|
||||||
use std::ptr::NonNull;
|
use std::ptr::NonNull;
|
||||||
|
@ -768,62 +769,12 @@ impl String {
|
||||||
&self,
|
&self,
|
||||||
scope: &mut Isolate,
|
scope: &mut Isolate,
|
||||||
) -> std::string::String {
|
) -> std::string::String {
|
||||||
let len_utf16 = self.length();
|
// SAFETY: @devsnek said it is fine.
|
||||||
|
let string = unsafe { Local::from_raw(self).unwrap_unchecked() };
|
||||||
// No need to allocate or do any work for zero-length strings
|
let view = ValueView::new(scope, string);
|
||||||
if len_utf16 == 0 {
|
match view.data() {
|
||||||
return std::string::String::new();
|
ValueViewData::OneByte(bytes) => latin1_to_string(bytes),
|
||||||
}
|
ValueViewData::TwoByte(code_points) => wtf16_to_string(code_points),
|
||||||
|
|
||||||
let len_utf8 = self.utf8_length(scope);
|
|
||||||
|
|
||||||
// If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the
|
|
||||||
// string is 100% 7-bit ASCII.
|
|
||||||
if self.is_onebyte() && len_utf8 == len_utf16 {
|
|
||||||
unsafe {
|
|
||||||
// Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
|
|
||||||
// accidentally creating a slice of u8 which would be invalid.
|
|
||||||
let layout = std::alloc::Layout::from_size_align(len_utf16, 1).unwrap();
|
|
||||||
let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
|
|
||||||
let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf16);
|
|
||||||
|
|
||||||
// Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
|
|
||||||
let length = self.write_one_byte_uninit(
|
|
||||||
scope,
|
|
||||||
&mut *buffer,
|
|
||||||
0,
|
|
||||||
WriteOptions::NO_NULL_TERMINATION
|
|
||||||
| WriteOptions::REPLACE_INVALID_UTF8,
|
|
||||||
);
|
|
||||||
debug_assert!(length == len_utf16);
|
|
||||||
|
|
||||||
// Return an owned string from this guaranteed now-initialized data
|
|
||||||
let buffer = data as *mut u8;
|
|
||||||
return std::string::String::from_raw_parts(buffer, length, len_utf16);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
|
|
||||||
// We have a large number of invariants to uphold, so please check changes to this code carefully
|
|
||||||
unsafe {
|
|
||||||
// Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
|
|
||||||
// accidentally creating a slice of u8 which would be invalid.
|
|
||||||
let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
|
|
||||||
let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
|
|
||||||
let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);
|
|
||||||
|
|
||||||
// Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
|
|
||||||
let length = self.write_utf8_uninit(
|
|
||||||
scope,
|
|
||||||
&mut *buffer,
|
|
||||||
None,
|
|
||||||
WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
|
|
||||||
);
|
|
||||||
debug_assert!(length == len_utf8);
|
|
||||||
|
|
||||||
// Return an owned string from this guaranteed now-initialized data
|
|
||||||
let buffer = data as *mut u8;
|
|
||||||
std::string::String::from_raw_parts(buffer, length, len_utf8)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -834,110 +785,308 @@ impl String {
|
||||||
scope: &mut Isolate,
|
scope: &mut Isolate,
|
||||||
buffer: &'a mut [MaybeUninit<u8>; N],
|
buffer: &'a mut [MaybeUninit<u8>; N],
|
||||||
) -> Cow<'a, str> {
|
) -> Cow<'a, str> {
|
||||||
let len_utf16 = self.length();
|
// SAFETY: @devsnek said it is fine.
|
||||||
|
let string = unsafe { Local::from_raw(self).unwrap_unchecked() };
|
||||||
// No need to allocate or do any work for zero-length strings
|
let view = ValueView::new(scope, string);
|
||||||
if len_utf16 == 0 {
|
match view.data() {
|
||||||
return "".into();
|
ValueViewData::OneByte(bytes) => latin1_to_cow_str(bytes, buffer),
|
||||||
}
|
ValueViewData::TwoByte(code_points) => {
|
||||||
|
wtf16_to_cow_str(code_points, buffer)
|
||||||
// TODO(mmastrac): Ideally we should be able to access the string's internal representation
|
|
||||||
let len_utf8 = self.utf8_length(scope);
|
|
||||||
|
|
||||||
// If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the
|
|
||||||
// string is 100% 7-bit ASCII.
|
|
||||||
if self.is_onebyte() && len_utf8 == len_utf16 {
|
|
||||||
if len_utf16 <= N {
|
|
||||||
let length = self.write_one_byte_uninit(
|
|
||||||
scope,
|
|
||||||
buffer,
|
|
||||||
0,
|
|
||||||
WriteOptions::NO_NULL_TERMINATION,
|
|
||||||
);
|
|
||||||
debug_assert!(length == len_utf16);
|
|
||||||
unsafe {
|
|
||||||
// Get a slice of &[u8] of what we know is initialized now
|
|
||||||
let buffer = &mut buffer[..length];
|
|
||||||
let buffer = &mut *(buffer as *mut [_] as *mut [u8]);
|
|
||||||
|
|
||||||
// We know it's valid UTF-8, so make a string
|
|
||||||
return Cow::Borrowed(std::str::from_utf8_unchecked(buffer));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe {
|
|
||||||
// Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
|
|
||||||
// accidentally creating a slice of u8 which would be invalid.
|
|
||||||
let layout = std::alloc::Layout::from_size_align(len_utf16, 1).unwrap();
|
|
||||||
let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
|
|
||||||
let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf16);
|
|
||||||
|
|
||||||
// Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
|
|
||||||
let length = self.write_one_byte_uninit(
|
|
||||||
scope,
|
|
||||||
&mut *buffer,
|
|
||||||
0,
|
|
||||||
WriteOptions::NO_NULL_TERMINATION
|
|
||||||
| WriteOptions::REPLACE_INVALID_UTF8,
|
|
||||||
);
|
|
||||||
debug_assert!(length == len_utf16);
|
|
||||||
|
|
||||||
// Return an owned string from this guaranteed now-initialized data
|
|
||||||
let buffer = data as *mut u8;
|
|
||||||
return Cow::Owned(std::string::String::from_raw_parts(
|
|
||||||
buffer, length, len_utf16,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len_utf8 <= N {
|
|
||||||
// No malloc path
|
|
||||||
let length = self.write_utf8_uninit(
|
|
||||||
scope,
|
|
||||||
buffer,
|
|
||||||
None,
|
|
||||||
WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
|
|
||||||
);
|
|
||||||
debug_assert!(length == len_utf8);
|
|
||||||
|
|
||||||
// SAFETY: We know that we wrote `length` UTF-8 bytes. See `slice_assume_init_mut` for additional guarantee information.
|
|
||||||
unsafe {
|
|
||||||
// Get a slice of &[u8] of what we know is initialized now
|
|
||||||
let buffer = &mut buffer[..length];
|
|
||||||
let buffer = &mut *(buffer as *mut [_] as *mut [u8]);
|
|
||||||
|
|
||||||
// We know it's valid UTF-8, so make a string
|
|
||||||
return Cow::Borrowed(std::str::from_utf8_unchecked(buffer));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
|
|
||||||
// We have a large number of invariants to uphold, so please check changes to this code carefully
|
|
||||||
unsafe {
|
|
||||||
// Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
|
|
||||||
// accidentally creating a slice of u8 which would be invalid.
|
|
||||||
let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
|
|
||||||
let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
|
|
||||||
let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);
|
|
||||||
|
|
||||||
// Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
|
|
||||||
let length = self.write_utf8_uninit(
|
|
||||||
scope,
|
|
||||||
&mut *buffer,
|
|
||||||
None,
|
|
||||||
WriteOptions::NO_NULL_TERMINATION | WriteOptions::REPLACE_INVALID_UTF8,
|
|
||||||
);
|
|
||||||
debug_assert!(length == len_utf8);
|
|
||||||
|
|
||||||
// Return an owned string from this guaranteed now-initialized data
|
|
||||||
let buffer = data as *mut u8;
|
|
||||||
Cow::Owned(std::string::String::from_raw_parts(
|
|
||||||
buffer, length, len_utf8,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn latin1_to_string(bytes: &[u8]) -> std::string::String {
|
||||||
|
// Perf: it seems to be faster to check if the string is ASCII first and
|
||||||
|
// then do a memcpy if it is, rather than checking and copying each byte
|
||||||
|
// individually.
|
||||||
|
if bytes.is_ascii() {
|
||||||
|
// SAFETY: The string is ASCII, so it's valid UTF-8.
|
||||||
|
(unsafe { std::str::from_utf8_unchecked(bytes) }).to_owned()
|
||||||
|
} else {
|
||||||
|
// TODO: this could likely be optimized for large strings by using SIMD to
|
||||||
|
// calculate the length of the resulting string and then allocating once,
|
||||||
|
// and then converting the string using SIMD.
|
||||||
|
std::string::String::from_utf8_lossy(bytes).into_owned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The cutoff for when to use SIMD for converting WTF-16 to UTF-8. Any slice of
|
||||||
|
/// code points longer than this will use SIMD, and any shorter will use the
|
||||||
|
/// scalar implementation.
|
||||||
|
const WTF16_CODE_POINT_LENGTH_CUTOFF_FOR_SIMD: usize = 96;
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn wtf16_to_string(code_points: &[u16]) -> std::string::String {
|
||||||
|
// If the code points are longer than the cutoff and are valid UTF-16, use
|
||||||
|
// SIMD to convert them to UTF-8. Otherwise we use the scalar implementation.
|
||||||
|
if code_points.len() > WTF16_CODE_POINT_LENGTH_CUTOFF_FOR_SIMD
|
||||||
|
&& simdutf::validate_utf16(code_points)
|
||||||
|
{
|
||||||
|
let len_utf8 = simdutf::utf8_length_from_utf16(code_points);
|
||||||
|
|
||||||
|
let buffer = allocate_byte_buffer(len_utf8);
|
||||||
|
|
||||||
|
// SAFETY: The buffer is large enough to hold the UTF-8 data.
|
||||||
|
let written = unsafe {
|
||||||
|
simdutf::convert_utf16_to_utf8(
|
||||||
|
code_points.as_ptr(),
|
||||||
|
code_points.len(),
|
||||||
|
buffer as *mut u8,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
debug_assert_eq!(written, len_utf8);
|
||||||
|
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
unsafe {
|
||||||
|
std::string::String::from_raw_parts(buffer as *mut u8, written, len_utf8)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let len_utf8 = utf8_length_from_utf16_vectorized(code_points);
|
||||||
|
|
||||||
|
let buffer = allocate_byte_buffer(len_utf8);
|
||||||
|
|
||||||
|
// SAFETY: The buffer is large enough to hold the UTF-8 data.
|
||||||
|
let written =
|
||||||
|
unsafe { wtf16_to_utf8_lossy(code_points, buffer as *mut u8) };
|
||||||
|
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
unsafe {
|
||||||
|
std::string::String::from_raw_parts(buffer as *mut u8, written, len_utf8)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn latin1_to_cow_str<'a, const N: usize>(
|
||||||
|
bytes: &[u8],
|
||||||
|
buffer: &'a mut [MaybeUninit<u8>; N],
|
||||||
|
) -> Cow<'a, str> {
|
||||||
|
let is_ascii = bytes.is_ascii();
|
||||||
|
if is_ascii && bytes.len() <= N {
|
||||||
|
// SAFETY: The string is ASCII, so it's valid UTF-8. We know that the
|
||||||
|
// buffer can not be overlapping, as we never expose a &mut to the
|
||||||
|
// v8::ValueViewData buffer.
|
||||||
|
let str = unsafe {
|
||||||
|
std::ptr::copy_nonoverlapping(
|
||||||
|
bytes.as_ptr(),
|
||||||
|
buffer.as_mut_ptr() as *mut u8,
|
||||||
|
bytes.len(),
|
||||||
|
);
|
||||||
|
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
||||||
|
buffer.as_ptr() as *const u8,
|
||||||
|
bytes.len(),
|
||||||
|
))
|
||||||
|
};
|
||||||
|
Cow::Borrowed(str)
|
||||||
|
} else if bytes.len() * 2 < N {
|
||||||
|
// SAFETY: The string is Latin1 - we need to convert to UTF-8. But it
|
||||||
|
// is short enough to fit into the buffer, because the buffer is at
|
||||||
|
// least twice as large as the string and any non-ASCII one-byte
|
||||||
|
// character will be encoded as exactly two bytes in UTF-8.
|
||||||
|
let written = unsafe {
|
||||||
|
latin1_to_utf8(
|
||||||
|
bytes.len(),
|
||||||
|
bytes.as_ptr(),
|
||||||
|
buffer.as_mut_ptr() as *mut u8,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
debug_assert!(written <= buffer.len());
|
||||||
|
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
let str = unsafe {
|
||||||
|
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
||||||
|
buffer.as_ptr() as *const u8,
|
||||||
|
written,
|
||||||
|
))
|
||||||
|
};
|
||||||
|
Cow::Borrowed(str)
|
||||||
|
} else if is_ascii {
|
||||||
|
// Perf: it seems to be faster to check if the string is ASCII first and
|
||||||
|
// then do a memcpy if it is, rather than checking and copying each byte
|
||||||
|
// individually.
|
||||||
|
|
||||||
|
// SAFETY: The string is ASCII, so it's valid UTF-8.
|
||||||
|
Cow::Owned((unsafe { std::str::from_utf8_unchecked(bytes) }).to_owned())
|
||||||
|
} else {
|
||||||
|
// TODO: this could likely be optimized for large strings by using SIMD to
|
||||||
|
// calculate the length of the resulting string and then allocating once,
|
||||||
|
// and then converting the string using SIMD.
|
||||||
|
Cow::Owned(std::string::String::from_utf8_lossy(bytes).into_owned())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn wtf16_to_cow_str<'a, const N: usize>(
|
||||||
|
code_points: &[u16],
|
||||||
|
buffer: &'a mut [MaybeUninit<u8>; N],
|
||||||
|
) -> Cow<'a, str> {
|
||||||
|
if code_points.len() >= WTF16_CODE_POINT_LENGTH_CUTOFF_FOR_SIMD
|
||||||
|
&& simdutf::validate_utf16(code_points)
|
||||||
|
{
|
||||||
|
let len_utf8 = simdutf::utf8_length_from_utf16(code_points);
|
||||||
|
|
||||||
|
let (buffer, owned) = if buffer.len() >= len_utf8 {
|
||||||
|
(buffer.as_mut_ptr(), false)
|
||||||
|
} else {
|
||||||
|
let buffer = allocate_byte_buffer(len_utf8);
|
||||||
|
(buffer, true)
|
||||||
|
};
|
||||||
|
|
||||||
|
// SAFETY: The buffer is large enough to hold the UTF-8 data.
|
||||||
|
let written = unsafe {
|
||||||
|
simdutf::convert_utf16_to_utf8(
|
||||||
|
code_points.as_ptr(),
|
||||||
|
code_points.len(),
|
||||||
|
buffer as *mut u8,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
if owned {
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
let str = unsafe {
|
||||||
|
std::string::String::from_raw_parts(
|
||||||
|
buffer as *mut u8,
|
||||||
|
written,
|
||||||
|
len_utf8,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
Cow::Owned(str)
|
||||||
|
} else {
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
let str = unsafe {
|
||||||
|
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
||||||
|
buffer as *const u8,
|
||||||
|
written,
|
||||||
|
))
|
||||||
|
};
|
||||||
|
Cow::Borrowed(str)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let len_utf8 = utf8_length_from_utf16_vectorized(code_points);
|
||||||
|
|
||||||
|
let (buffer, owned) = if buffer.len() >= len_utf8 {
|
||||||
|
(buffer.as_mut_ptr(), false)
|
||||||
|
} else {
|
||||||
|
let buffer = allocate_byte_buffer(len_utf8);
|
||||||
|
(buffer, true)
|
||||||
|
};
|
||||||
|
|
||||||
|
// SAFETY: The buffer is large enough to hold the UTF-8 data.
|
||||||
|
let written =
|
||||||
|
unsafe { wtf16_to_utf8_lossy(code_points, buffer as *mut u8) };
|
||||||
|
|
||||||
|
if owned {
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
let str = unsafe {
|
||||||
|
std::string::String::from_raw_parts(
|
||||||
|
buffer as *mut u8,
|
||||||
|
written,
|
||||||
|
len_utf8,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
Cow::Owned(str)
|
||||||
|
} else {
|
||||||
|
// SAFETY: The buffer is filled with valid UTF-8 data.
|
||||||
|
let str = unsafe {
|
||||||
|
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
||||||
|
buffer as *const u8,
|
||||||
|
written,
|
||||||
|
))
|
||||||
|
};
|
||||||
|
Cow::Borrowed(str)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn allocate_byte_buffer(len: usize) -> *mut MaybeUninit<u8> {
|
||||||
|
debug_assert!(len > 0);
|
||||||
|
let layout = std::alloc::Layout::from_size_align(len, 1).unwrap();
|
||||||
|
// SAFETY: The layout is valid.
|
||||||
|
(unsafe { std::alloc::alloc(layout) }) as *mut MaybeUninit<u8>
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn utf8_length_from_utf16_vectorized(code_points: &[u16]) -> usize {
|
||||||
|
std::char::decode_utf16(code_points.into_iter().copied())
|
||||||
|
.map(|c| c.unwrap_or(std::char::REPLACEMENT_CHARACTER))
|
||||||
|
.map(|c| c.len_utf8())
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Expands `inbuf` to `outbuf`, assuming that `outbuf` has at least 2x `input_length`.
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn latin1_to_utf8(
|
||||||
|
input_length: usize,
|
||||||
|
inbuf: *const u8,
|
||||||
|
outbuf: *mut u8,
|
||||||
|
) -> usize {
|
||||||
|
let mut output = 0;
|
||||||
|
let mut input = 0;
|
||||||
|
while input < input_length {
|
||||||
|
let char = *(inbuf.add(input));
|
||||||
|
if char < 0x80 {
|
||||||
|
*(outbuf.add(output)) = char;
|
||||||
|
output += 1;
|
||||||
|
} else {
|
||||||
|
// Top two bits
|
||||||
|
*(outbuf.add(output)) = (char >> 6) | 0b1100_0000;
|
||||||
|
// Bottom six bits
|
||||||
|
*(outbuf.add(output + 1)) = (char & 0b0011_1111) | 0b1000_0000;
|
||||||
|
output += 2;
|
||||||
|
}
|
||||||
|
input += 1;
|
||||||
|
}
|
||||||
|
output
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn wtf16_to_utf8_lossy(input: &[u16], outbuf: *mut u8) -> usize {
|
||||||
|
let utf8 = std::char::decode_utf16(input.into_iter().copied());
|
||||||
|
let mut output = 0;
|
||||||
|
for c in utf8 {
|
||||||
|
let c = c.unwrap_or(std::char::REPLACEMENT_CHARACTER);
|
||||||
|
let len = c.len_utf8();
|
||||||
|
let code = c as u32;
|
||||||
|
const TAG_TWO_BYTE: u8 = 0xC0;
|
||||||
|
const TAG_THREE_BYTE: u8 = 0xE0;
|
||||||
|
const TAG_FOUR_BYTE: u8 = 0xF0;
|
||||||
|
const TAG_CONT: u8 = 0x80;
|
||||||
|
match len {
|
||||||
|
1 => {
|
||||||
|
*(outbuf.add(output)) = c as u8;
|
||||||
|
output += 1;
|
||||||
|
}
|
||||||
|
2 => {
|
||||||
|
*(outbuf.add(output)) = TAG_TWO_BYTE | ((code >> 6) as u8);
|
||||||
|
*(outbuf.add(output + 1)) = TAG_CONT | ((code & 0x3F) as u8);
|
||||||
|
output += 2;
|
||||||
|
}
|
||||||
|
3 => {
|
||||||
|
*(outbuf.add(output)) = TAG_THREE_BYTE | ((code >> 12) as u8);
|
||||||
|
*(outbuf.add(output + 1)) = TAG_CONT | (((code >> 6) & 0x3F) as u8);
|
||||||
|
*(outbuf.add(output + 2)) = TAG_CONT | ((code & 0x3F) as u8);
|
||||||
|
output += 3;
|
||||||
|
}
|
||||||
|
4 => {
|
||||||
|
*(outbuf.add(output)) = TAG_FOUR_BYTE | ((code >> 18) as u8);
|
||||||
|
*(outbuf.add(output + 1)) = TAG_CONT | (((code >> 12) & 0x3F) as u8);
|
||||||
|
*(outbuf.add(output + 2)) = TAG_CONT | (((code >> 6) & 0x3F) as u8);
|
||||||
|
*(outbuf.add(output + 3)) = TAG_CONT | ((code & 0x3F) as u8);
|
||||||
|
output += 4;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// SAFETY: We know that the length is 1, 2, 3, or 4.
|
||||||
|
unsafe { unreachable_unchecked() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output
|
||||||
|
}
|
||||||
|
|
||||||
pub extern "C" fn free_rust_external_onebyte(s: *mut char, len: usize) {
|
pub extern "C" fn free_rust_external_onebyte(s: *mut char, len: usize) {
|
||||||
unsafe {
|
unsafe {
|
||||||
let slice = std::slice::from_raw_parts_mut(s, len);
|
let slice = std::slice::from_raw_parts_mut(s, len);
|
||||||
|
@ -970,7 +1119,7 @@ pub struct ValueView<'s>(
|
||||||
impl<'s> ValueView<'s> {
|
impl<'s> ValueView<'s> {
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn new(isolate: &mut Isolate, string: Local<'s, String>) -> Self {
|
pub fn new(isolate: &mut Isolate, string: Local<'s, String>) -> Self {
|
||||||
let mut v = std::mem::MaybeUninit::uninit();
|
let mut v: MaybeUninit<ValueView<'_>> = std::mem::MaybeUninit::uninit();
|
||||||
unsafe {
|
unsafe {
|
||||||
v8__String__ValueView__CONSTRUCT(v.as_mut_ptr(), isolate, &*string);
|
v8__String__ValueView__CONSTRUCT(v.as_mut_ptr(), isolate, &*string);
|
||||||
v.assume_init()
|
v.assume_init()
|
||||||
|
|
Loading…
Reference in a new issue