From dd428d1dc8e182c26185b70c619dcc71bc5d5d05 Mon Sep 17 00:00:00 2001 From: Arnau Orriols Date: Wed, 7 Sep 2022 08:53:56 +0200 Subject: [PATCH] feat(ext/ffi): Implement FFI fast-call trampoline with Dynasmrt (#15305) --- .gitmodules | 4 +- Cargo.lock | 37 + ext/ffi/Cargo.toml | 1 + ext/ffi/build.rs | 70 - ext/ffi/fast_call.rs | 2065 +++++++++++++++++++++++++++ ext/ffi/jit_trampoline.rs | 263 ---- ext/ffi/lib.rs | 114 +- ext/ffi/prelude.h | 36 - ext/ffi/tcc.rs | 116 -- ext/ffi/tinycc | 1 - test_ffi/Cargo.toml | 1 + test_ffi/src/lib.rs | 54 + test_ffi/tests/integration_tests.rs | 14 + test_ffi/tests/test.js | 105 +- test_util/Cargo.toml | 2 +- 15 files changed, 2282 insertions(+), 601 deletions(-) delete mode 100644 ext/ffi/build.rs create mode 100644 ext/ffi/fast_call.rs delete mode 100644 ext/ffi/jit_trampoline.rs delete mode 100644 ext/ffi/prelude.h delete mode 100644 ext/ffi/tcc.rs delete mode 160000 ext/ffi/tinycc diff --git a/.gitmodules b/.gitmodules index a94ebe6689..9e4f12afa6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,6 +9,4 @@ [submodule "test_util/wpt"] path = test_util/wpt url = https://github.com/web-platform-tests/wpt.git -[submodule "ext/ffi/tinycc"] - path = ext/ffi/tinycc - url = https://github.com/TinyCC/tinycc + diff --git a/Cargo.lock b/Cargo.lock index 7a35eafd2b..fba5040678 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1035,6 +1035,7 @@ version = "0.54.0" dependencies = [ "deno_core", "dlopen", + "dynasmrt", "libffi", "serde", "tokio", @@ -1467,6 +1468,32 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21e50f3adc76d6a43f5ed73b698a87d0760ca74617f60f7c3b879003536fdd28" +[[package]] +name = "dynasm" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add9a102807b524ec050363f09e06f1504214b0e1c7797f64261c891022dce8b" +dependencies = [ + "bitflags", + "byteorder", + "lazy_static", + "proc-macro-error", + "proc-macro2 1.0.39", + "quote 1.0.18", + "syn 1.0.96", +] + +[[package]] +name = "dynasmrt" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64fba5a42bd76a17cad4bfa00de168ee1cbfa06a5e8ce992ae880218c05641a9" +dependencies = [ + "byteorder", + "dynasm", + "memmap2", +] + [[package]] name = "ecdsa" version = "0.14.1" @@ -2657,6 +2684,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memmap2" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a79b39c93a7a5a27eeaf9a23b5ff43f1b9e0ad6b1cdd441140ae53c35613fc7" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.6.5" @@ -4590,6 +4626,7 @@ dependencies = [ name = "test_ffi" version = "0.1.0" dependencies = [ + "pretty_assertions", "test_util", ] diff --git a/ext/ffi/Cargo.toml b/ext/ffi/Cargo.toml index af866317f1..be094c3dcd 100644 --- a/ext/ffi/Cargo.toml +++ b/ext/ffi/Cargo.toml @@ -16,6 +16,7 @@ path = "lib.rs" [dependencies] deno_core = { version = "0.149.0", path = "../../core" } dlopen = "0.1.8" +dynasmrt = "1.2.3" libffi = "3.0.0" serde = { version = "1.0.129", features = ["derive"] } tokio = { version = "1.17", features = ["full"] } diff --git a/ext/ffi/build.rs b/ext/ffi/build.rs deleted file mode 100644 index 1debd6b9c0..0000000000 --- a/ext/ffi/build.rs +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. - -#[cfg(not(target_os = "windows"))] -fn build_tcc() { - use std::env; - - { - // TODO(@littledivy): Windows support for fast call. - // let tcc_path = root - // .parent() - // .unwrap() - // .to_path_buf() - // .parent() - // .unwrap() - // .to_path_buf() - // .join("third_party") - // .join("prebuilt") - // .join("win"); - // println!("cargo:rustc-link-search=native={}", tcc_path.display()); - } - #[cfg(not(target_os = "windows"))] - { - use std::path::PathBuf; - use std::process::exit; - use std::process::Command; - - let root = PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"))); - let tcc_src = root.join("tinycc"); - dbg!(&tcc_src); - let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); - let mut configure = Command::new(tcc_src.join("configure")); - configure.current_dir(&out_dir); - configure.args(&["--enable-static", "--extra-cflags=-fPIC -O3 -g -static"]); - let status = configure.status().unwrap(); - if !status.success() { - eprintln!("Fail to configure: {:?}", status); - exit(1); - } - - let mut make = Command::new("make"); - make.current_dir(&out_dir).arg(format!( - "-j{}", - env::var("NUM_JOBS").unwrap_or_else(|_| String::from("1")) - )); - make.args(&["libtcc.a"]); - let status = make.status().unwrap(); - - if !status.success() { - eprintln!("Fail to make: {:?}", status); - exit(1); - } - println!("cargo:rustc-link-search=native={}", out_dir.display()); - println!("cargo:rerun-if-changed={}", tcc_src.display()); - } -} - -#[cfg(target_os = "windows")] -fn main() {} - -#[cfg(not(target_os = "windows"))] -fn main() { - use std::env; - - if let Ok(tcc_path) = env::var("TCC_PATH") { - println!("cargo:rustc-link-search=native={}", tcc_path); - } else { - build_tcc(); - } - println!("cargo:rustc-link-lib=static=tcc"); -} diff --git a/ext/ffi/fast_call.rs b/ext/ffi/fast_call.rs new file mode 100644 index 0000000000..dc098a69aa --- /dev/null +++ b/ext/ffi/fast_call.rs @@ -0,0 +1,2065 @@ +// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. + +use std::cmp::max; +use std::ffi::c_void; +use std::iter::once; + +use deno_core::v8::fast_api; +use dynasmrt::dynasm; +use dynasmrt::DynasmApi; +use dynasmrt::ExecutableBuffer; + +use crate::needs_unwrap; +use crate::NativeType; +use crate::Symbol; + +pub(crate) fn is_compatible(sym: &Symbol) -> bool { + cfg!(any( + all(target_arch = "x86_64", target_family = "unix"), + all(target_arch = "x86_64", target_family = "windows"), + all(target_arch = "aarch64", target_vendor = "apple") + )) && !sym.can_callback +} + +pub(crate) fn compile_trampoline(sym: &Symbol) -> Trampoline { + #[cfg(all(target_arch = "x86_64", target_family = "unix"))] + return SysVAmd64::compile(sym); + #[cfg(all(target_arch = "x86_64", target_family = "windows"))] + return Win64::compile(sym); + #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] + return Aarch64Apple::compile(sym); + #[allow(unreachable_code)] + { + unimplemented!("fast API is not implemented for the current target"); + } +} + +pub(crate) fn make_template(sym: &Symbol, trampoline: &Trampoline) -> Template { + let mut params = once(fast_api::Type::V8Value) // Receiver + .chain(sym.parameter_types.iter().map(|t| t.into())) + .collect::>(); + + let ret = if needs_unwrap(sym.result_type) { + params.push(fast_api::Type::TypedArray(fast_api::CType::Int32)); + fast_api::Type::Void + } else { + fast_api::Type::from(&sym.result_type) + }; + + Template { + args: params.into_boxed_slice(), + ret: (&ret).into(), + symbol_ptr: trampoline.ptr(), + } +} + +/// Trampoline for fast-call FFI functions +/// +/// Calls the FFI function without the first argument (the receiver) +pub(crate) struct Trampoline(ExecutableBuffer); + +impl Trampoline { + fn ptr(&self) -> *const c_void { + &self.0[0] as *const u8 as *const c_void + } +} + +pub(crate) struct Template { + args: Box<[fast_api::Type]>, + ret: fast_api::CType, + symbol_ptr: *const c_void, +} + +impl fast_api::FastFunction for Template { + fn function(&self) -> *const c_void { + self.symbol_ptr + } + + fn args(&self) -> &'static [fast_api::Type] { + Box::leak(self.args.clone()) + } + + fn return_type(&self) -> fast_api::CType { + self.ret + } +} + +impl From<&NativeType> for fast_api::Type { + fn from(native_type: &NativeType) -> Self { + match native_type { + NativeType::Bool => fast_api::Type::Bool, + NativeType::U8 | NativeType::U16 | NativeType::U32 => { + fast_api::Type::Uint32 + } + NativeType::I8 | NativeType::I16 | NativeType::I32 => { + fast_api::Type::Int32 + } + NativeType::F32 => fast_api::Type::Float32, + NativeType::F64 => fast_api::Type::Float64, + NativeType::Void => fast_api::Type::Void, + NativeType::I64 => fast_api::Type::Int64, + NativeType::U64 => fast_api::Type::Uint64, + NativeType::ISize => fast_api::Type::Int64, + NativeType::USize | NativeType::Pointer | NativeType::Function => { + fast_api::Type::Uint64 + } + NativeType::Buffer => fast_api::Type::TypedArray(fast_api::CType::Uint8), + } + } +} + +macro_rules! x64 { + ($assembler:expr; $($tokens:tt)+) => { + dynasm!($assembler; .arch x64; $($tokens)+) + } +} + +macro_rules! aarch64 { + ($assembler:expr; $($tokens:tt)+) => { + dynasm!($assembler; .arch aarch64; $($tokens)+) + } +} + +struct SysVAmd64 { + // Reference: https://refspecs.linuxfoundation.org/elf/x86_64-abi-0.99.pdf + assmblr: dynasmrt::x64::Assembler, + // Parameter counters + integral_params: u32, + float_params: u32, + // Stack offset accumulators + offset_trampoline: u32, + offset_callee: u32, + allocated_stack: u32, + frame_pointer: u32, +} + +#[cfg_attr( + not(all(target_aarch = "x86_64", target_family = "unix")), + allow(dead_code) +)] +impl SysVAmd64 { + // Integral arguments go to the following GPR, in order: rdi, rsi, rdx, rcx, r8, r9 + const INTEGRAL_REGISTERS: u32 = 6; + // SSE arguments go to the first 8 SSE registers: xmm0-xmm7 + const FLOAT_REGISTERS: u32 = 8; + + fn new() -> Self { + Self { + assmblr: dynasmrt::x64::Assembler::new().unwrap(), + integral_params: 0, + float_params: 0, + // Start at 8 to account for trampoline caller's return address + offset_trampoline: 8, + // default to tail-call mode. If a new stack frame is allocated this becomes 0 + offset_callee: 8, + allocated_stack: 0, + frame_pointer: 0, + } + } + + fn compile(sym: &Symbol) -> Trampoline { + let mut compiler = Self::new(); + + let must_cast_return_value = + compiler.must_cast_return_value(sym.result_type); + let must_wrap_return_value = + compiler.must_wrap_return_value_in_typed_array(sym.result_type); + let must_save_preserved_register = must_wrap_return_value; + let cannot_tailcall = must_cast_return_value || must_wrap_return_value; + + if cannot_tailcall { + if must_save_preserved_register { + compiler.save_preserved_register_to_stack(); + } + compiler.allocate_stack(&sym.parameter_types); + } + + for param in sym.parameter_types.iter().copied() { + compiler.move_left(param) + } + if !compiler.is_recv_arg_overridden() { + // the receiver object should never be expected. Avoid its unexpected or deliberate leak + compiler.zero_first_arg(); + } + if must_wrap_return_value { + compiler.save_out_array_to_preserved_register(); + } + + if cannot_tailcall { + compiler.call(sym.ptr.as_ptr()); + if must_cast_return_value { + compiler.cast_return_value(sym.result_type); + } + if must_wrap_return_value { + compiler.wrap_return_value_in_out_array(); + } + compiler.deallocate_stack(); + if must_save_preserved_register { + compiler.recover_preserved_register(); + } + compiler.ret(); + } else { + compiler.tailcall(sym.ptr.as_ptr()); + } + + Trampoline(compiler.finalize()) + } + + fn move_left(&mut self, param: NativeType) { + // Section 3.2.3 of the SysV ABI spec, on argument classification: + // - INTEGER: + // > Arguments of types (signed and unsigned) _Bool, char, short, int, + // > long, long long, and pointers are in the INTEGER class. + // - SSE: + // > Arguments of types float, double, _Decimal32, _Decimal64 and + // > __m64 are in class SSE. + match param.into() { + Int(integral) => self.move_integral(integral), + Float(float) => self.move_float(float), + } + } + + fn move_float(&mut self, param: Floating) { + // Section 3.2.3 of the SysV AMD64 ABI: + // > If the class is SSE, the next available vector register is used, the registers + // > are taken in the order from %xmm0 to %xmm7. + // [...] + // > Once registers are assigned, the arguments passed in memory are pushed on + // > the stack in reversed (right-to-left) order + let param_i = self.float_params; + + let is_in_stack = param_i >= Self::FLOAT_REGISTERS; + // floats are only moved to accommodate integer movement in the stack + let stack_has_moved = self.allocated_stack > 0 + || self.integral_params >= Self::INTEGRAL_REGISTERS; + + if is_in_stack && stack_has_moved { + let s = &mut self.assmblr; + let ot = self.offset_trampoline as i32; + let oc = self.offset_callee as i32; + match param { + Single => x64!(s + ; movss xmm8, [rsp + ot] + ; movss [rsp + oc], xmm8 + ), + Double => x64!(s + ; movsd xmm8, [rsp + ot] + ; movsd [rsp + oc], xmm8 + ), + } + + // Section 3.2.3 of the SysV AMD64 ABI: + // > The size of each argument gets rounded up to eightbytes. [...] Therefore the stack will always be eightbyte aligned. + self.offset_trampoline += 8; + self.offset_callee += 8; + + debug_assert!( + self.allocated_stack == 0 || self.offset_callee <= self.allocated_stack + ); + } + self.float_params += 1; + } + + fn move_integral(&mut self, arg: Integral) { + // Section 3.2.3 of the SysV AMD64 ABI: + // > If the class is INTEGER, the next available register of the sequence %rdi, + // > %rsi, %rdx, %rcx, %r8 and %r9 is used + // [...] + // > Once registers are assigned, the arguments passed in memory are pushed on + // > the stack in reversed (right-to-left) order + let s = &mut self.assmblr; + let param_i = self.integral_params; + + // move each argument one position to the left. The first argument in the stack moves to the last integer register (r9). + // If the FFI function is called with a new stack frame, the arguments remaining in the stack are copied to the new stack frame. + // Otherwise, they are copied 8 bytes lower in the same frame + match (param_i, arg) { + // u8 and u16 parameters are defined as u32 parameters in the V8's fast API function. The trampoline takes care of the cast. + // Conventionally, many compilers expect 8 and 16 bit arguments to be sign/zero extended by the caller + // See https://stackoverflow.com/a/36760539/2623340 + (0, U(B)) => x64!(s; movzx edi, sil), + (0, I(B)) => x64!(s; movsx edi, sil), + (0, U(W)) => x64!(s; movzx edi, si), + (0, I(W)) => x64!(s; movsx edi, si), + (0, U(DW) | I(DW)) => x64!(s; mov edi, esi), + (0, U(QW) | I(QW)) => x64!(s; mov rdi, rsi), + // The fast API expects buffer arguments passed as a pointer to a FastApiTypedArray struct + // Here we blindly follow the layout of https://github.com/denoland/rusty_v8/blob/main/src/fast_api.rs#L190-L200 + // although that might be problematic: https://discord.com/channels/684898665143206084/956626010248478720/1009450940866252823 + (0, Buffer) => x64!(s; mov rdi, [rsi + 8]), + + (1, U(B)) => x64!(s; movzx esi, dl), + (1, I(B)) => x64!(s; movsx esi, dl), + (1, U(W)) => x64!(s; movzx esi, dx), + (1, I(W)) => x64!(s; movsx esi, dx), + (1, U(DW) | I(DW)) => x64!(s; mov esi, edx), + (1, U(QW) | I(QW)) => x64!(s; mov rsi, rdx), + (1, Buffer) => x64!(s; mov rsi, [rdx + 8]), + + (2, U(B)) => x64!(s; movzx edx, cl), + (2, I(B)) => x64!(s; movsx edx, cl), + (2, U(W)) => x64!(s; movzx edx, cx), + (2, I(W)) => x64!(s; movsx edx, cx), + (2, U(DW) | I(DW)) => x64!(s; mov edx, ecx), + (2, U(QW) | I(QW)) => x64!(s; mov rdx, rcx), + (2, Buffer) => x64!(s; mov rdx, [rcx + 8]), + + (3, U(B)) => x64!(s; movzx ecx, r8b), + (3, I(B)) => x64!(s; movsx ecx, r8b), + (3, U(W)) => x64!(s; movzx ecx, r8w), + (3, I(W)) => x64!(s; movsx ecx, r8w), + (3, U(DW) | I(DW)) => x64!(s; mov ecx, r8d), + (3, U(QW) | I(QW)) => x64!(s; mov rcx, r8), + (3, Buffer) => x64!(s; mov rcx, [r8 + 8]), + + (4, U(B)) => x64!(s; movzx r8d, r9b), + (4, I(B)) => x64!(s; movsx r8d, r9b), + (4, U(W)) => x64!(s; movzx r8d, r9w), + (4, I(W)) => x64!(s; movsx r8d, r9w), + (4, U(DW) | I(DW)) => x64!(s; mov r8d, r9d), + (4, U(QW) | I(QW)) => x64!(s; mov r8, r9), + (4, Buffer) => x64!(s; mov r8, [r9 + 8]), + + (5, param) => { + let ot = self.offset_trampoline as i32; + // First argument in stack goes to last register (r9) + match param { + U(B) => x64!(s; movzx r9d, BYTE [rsp + ot]), + I(B) => x64!(s; movsx r9d, BYTE [rsp + ot]), + U(W) => x64!(s; movzx r9d, WORD [rsp + ot]), + I(W) => x64!(s; movsx r9d, WORD [rsp + ot]), + U(DW) | I(DW) => x64!(s; mov r9d, [rsp + ot]), + U(QW) | I(QW) => x64!(s; mov r9, [rsp + ot]), + Buffer => x64!(s + ; mov r9, [rsp + ot] + ; mov r9, [r9 + 8] + ), + } + // Section 3.2.3 of the SysV AMD64 ABI: + // > The size of each argument gets rounded up to eightbytes. [...] Therefore the stack will always be eightbyte aligned. + self.offset_trampoline += 8; + } + + (6.., param) => { + let ot = self.offset_trampoline as i32; + let oc = self.offset_callee as i32; + match param { + U(B) => x64!(s + // TODO: optimize to [rsp] (without immediate) when offset is 0 + ; movzx eax, BYTE [rsp + ot] + ; mov [rsp + oc], eax + ), + I(B) => x64!(s + ; movsx eax, BYTE [rsp + ot] + ; mov [rsp + oc], eax + ), + U(W) => x64!(s + ; movzx eax, WORD [rsp + ot] + ; mov [rsp + oc], eax + ), + I(W) => x64!(s + ; movsx eax, WORD [rsp + ot] + ; mov [rsp + oc], eax + ), + U(DW) | I(DW) => x64!(s + ; mov eax, [rsp + ot] + ; mov [rsp + oc], eax + ), + U(QW) | I(QW) => x64!(s + ; mov rax, [rsp + ot] + ; mov [rsp + oc], rax + ), + Buffer => x64!(s + ; mov rax, [rsp + ot] + ; mov rax, [rax + 8] + ; mov [rsp + oc], rax + ), + } + // Section 3.2.3 of the SysV AMD64 ABI: + // > The size of each argument gets rounded up to eightbytes. [...] Therefore the stack will always be eightbyte aligned. + self.offset_trampoline += 8; + self.offset_callee += 8; + + debug_assert!( + self.allocated_stack == 0 + || self.offset_callee <= self.allocated_stack + ); + } + } + self.integral_params += 1; + } + + fn zero_first_arg(&mut self) { + debug_assert!( + self.integral_params == 0, + "the trampoline would zero the first argument after having overridden it with the second one" + ); + dynasm!(self.assmblr + ; .arch x64 + ; xor edi, edi + ); + } + + fn cast_return_value(&mut self, rv: NativeType) { + let s = &mut self.assmblr; + // V8 only supports 32bit integers. We support 8 and 16 bit integers casting them to 32bits. + // In SysV-AMD64 the convention dictates that the unused bits of the return value contain garbage, so we + // need to zero/sign extend the return value explicitly + match rv { + NativeType::U8 => x64!(s; movzx eax, al), + NativeType::I8 => x64!(s; movsx eax, al), + NativeType::U16 => x64!(s; movzx eax, ax), + NativeType::I16 => x64!(s; movsx eax, ax), + _ => (), + } + } + + fn save_out_array_to_preserved_register(&mut self) { + let s = &mut self.assmblr; + // functions returning 64 bit integers have the out array appended as their last parameter, + // and it is a *FastApiTypedArray + match self.integral_params { + // Trampoline's signature is (receiver, [param0, param1, ...], *FastApiTypedArray) + // self.integral_params account only for the original params [param0, param1, ...] + // and the out array has not been moved left + 0 => x64!(s; mov rbx, [rsi + 8]), + 1 => x64!(s; mov rbx, [rdx + 8]), + 2 => x64!(s; mov rbx, [rcx + 8]), + 3 => x64!(s; mov rbx, [r8 + 8]), + 4 => x64!(s; mov rbx, [r9 + 8]), + 5.. => { + x64!(s + ; mov rax, [rsp + self.offset_trampoline as i32] + ; mov rbx, [rax + 8] + ) + } + } + } + + fn wrap_return_value_in_out_array(&mut self) { + x64!(self.assmblr; mov [rbx], rax); + } + + fn save_preserved_register_to_stack(&mut self) { + x64!(self.assmblr; push rbx); + self.offset_trampoline += 8; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.frame_pointer += 8; + } + + fn recover_preserved_register(&mut self) { + debug_assert!( + self.frame_pointer >= 8, + "the trampoline would try to pop from the stack beyond its frame pointer" + ); + x64!(self.assmblr; pop rbx); + self.frame_pointer -= 8; + // parameter offsets are invalid once this method is called + } + + fn allocate_stack(&mut self, params: &[NativeType]) { + let mut int_params = 0u32; + let mut float_params = 0u32; + for param in params { + match param { + NativeType::F32 | NativeType::F64 => float_params += 1, + _ => int_params += 1, + } + } + let mut stack_size = (int_params.saturating_sub(Self::INTEGRAL_REGISTERS) + + float_params.saturating_sub(Self::FLOAT_REGISTERS)) + * 8; + + // Align new stack frame (accounting for the 8 byte of the trampoline caller's return address + // and any other potential addition to the stack prior to this allocation) + // Section 3.2.2 of the SysV AMD64 ABI: + // > The end of the input argument area shall be aligned on a 16 (32 or 64, if + // > __m256 or __m512 is passed on stack) byte boundary. In other words, the value + // > (%rsp + 8) is always a multiple of 16 (32 or 64) when control is transferred to + // > the function entry point. The stack pointer, %rsp, always points to the end of the + // > latest allocated stack frame. + stack_size += padding_to_align(16, self.frame_pointer + stack_size + 8); + + if stack_size > 0 { + x64!(self.assmblr; sub rsp, stack_size as i32); + self.offset_trampoline += stack_size; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.allocated_stack += stack_size; + self.frame_pointer += stack_size; + } + } + + fn deallocate_stack(&mut self) { + debug_assert!( + self.frame_pointer >= self.allocated_stack, + "the trampoline would try to deallocate stack beyond its frame pointer" + ); + if self.allocated_stack > 0 { + x64!(self.assmblr; add rsp, self.allocated_stack as i32); + + self.frame_pointer -= self.allocated_stack; + self.allocated_stack = 0; + } + } + + fn call(&mut self, ptr: *const c_void) { + // the stack has been aligned during stack allocation and/or pushing of preserved registers + debug_assert!( + (8 + self.frame_pointer) % 16 == 0, + "the trampoline would call the FFI function with an unaligned stack" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; call rax + ); + } + + fn tailcall(&mut self, ptr: *const c_void) { + // stack pointer is never modified and remains aligned + // return address remains the one provided by the trampoline's caller (V8) + debug_assert!( + self.allocated_stack == 0, + "the trampoline would tail call the FFI function with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would tail call the FFI function with outstanding locals in the frame" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; jmp rax + ); + } + + fn ret(&mut self) { + debug_assert!( + self.allocated_stack == 0, + "the trampoline would return with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would return with outstanding locals in the frame" + ); + x64!(self.assmblr; ret); + } + + fn is_recv_arg_overridden(&self) -> bool { + // V8 receiver is the first parameter of the trampoline function and is a pointer + self.integral_params > 0 + } + + fn must_cast_return_value(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 8 and 16 bit integers by extending them to 32 bits in the trampoline before returning + matches!( + rv, + NativeType::U8 | NativeType::I8 | NativeType::U16 | NativeType::I16 + ) + } + + fn must_wrap_return_value_in_typed_array(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 64 bit integers by wrapping them in a TypedArray out parameter + crate::needs_unwrap(rv) + } + + fn finalize(self) -> ExecutableBuffer { + self.assmblr.finalize().unwrap() + } +} + +struct Aarch64Apple { + // Reference https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst + assmblr: dynasmrt::aarch64::Assembler, + // Parameter counters + integral_params: u32, + float_params: u32, + // Stack offset accumulators + offset_trampoline: u32, + offset_callee: u32, + allocated_stack: u32, +} + +#[cfg_attr( + not(all(target_aarch = "aarch64", target_vendor = "apple")), + allow(dead_code) +)] +impl Aarch64Apple { + // Integral arguments go to the first 8 GPR: x0-x7 + const INTEGRAL_REGISTERS: u32 = 8; + // Floating-point arguments go to the first 8 SIMD & Floating-Point registers: v0-v1 + const FLOAT_REGISTERS: u32 = 8; + + fn new() -> Self { + Self { + assmblr: dynasmrt::aarch64::Assembler::new().unwrap(), + integral_params: 0, + float_params: 0, + offset_trampoline: 0, + offset_callee: 0, + allocated_stack: 0, + } + } + + fn compile(sym: &Symbol) -> Trampoline { + let mut compiler = Self::new(); + + let must_wrap_return_value = + compiler.must_wrap_return_value_in_typed_array(sym.result_type); + let must_save_preserved_register = must_wrap_return_value; + let cannot_tailcall = must_wrap_return_value; + + if cannot_tailcall { + compiler.allocate_stack(sym); + compiler.save_frame_record(); + if compiler.must_save_preserved_register_to_stack(sym) { + compiler.save_preserved_register_to_stack(); + } + } + + for param in sym.parameter_types.iter().copied() { + compiler.move_left(param) + } + if !compiler.is_recv_arg_overridden() { + // the receiver object should never be expected. Avoid its unexpected or deliberate leak + compiler.zero_first_arg(); + } + if compiler.must_wrap_return_value_in_typed_array(sym.result_type) { + compiler.save_out_array_to_preserved_register(); + } + + if cannot_tailcall { + compiler.call(sym.ptr.as_ptr()); + if must_wrap_return_value { + compiler.wrap_return_value_in_out_array(); + } + if must_save_preserved_register { + compiler.recover_preserved_register(); + } + compiler.recover_frame_record(); + compiler.deallocate_stack(); + compiler.ret(); + } else { + compiler.tailcall(sym.ptr.as_ptr()); + } + + Trampoline(compiler.finalize()) + } + + fn move_left(&mut self, param: NativeType) { + // Section 6.4.2 of the Aarch64 Procedure Call Standard (PCS), on argument classification: + // - INTEGRAL or POINTER: + // > If the argument is an Integral or Pointer Type, the size of the argument is less than or equal to 8 bytes + // > and the NGRN is less than 8, the argument is copied to the least significant bits in x[NGRN]. + // + // - Floating-Point or Vector: + // > If the argument is a Half-, Single-, Double- or Quad- precision Floating-point or short vector type + // > and the NSRN is less than 8, then the argument is allocated to the least significant bits of register v[NSRN] + match param.into() { + Int(integral) => self.move_integral(integral), + Float(float) => self.move_float(float), + } + } + + fn move_float(&mut self, param: Floating) { + // Section 6.4.2 of the Aarch64 PCS: + // > If the argument is a Half-, Single-, Double- or Quad- precision Floating-point or short vector type and the NSRN is less than 8, then the + // > argument is allocated to the least significant bits of register v[NSRN]. The NSRN is incremented by one. The argument has now been allocated. + // > [if NSRN is equal or more than 8] + // > The argument is copied to memory at the adjusted NSAA. The NSAA is incremented by the size of the argument. The argument has now been allocated. + let param_i = self.float_params; + + let is_in_stack = param_i >= Self::FLOAT_REGISTERS; + if is_in_stack { + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > Function arguments may consume slots on the stack that are not multiples of 8 bytes. + // (i.e. natural alignment instead of eightbyte alignment) + let padding_trampl = + (param.size() - self.offset_trampoline % param.size()) % param.size(); + let padding_callee = + (param.size() - self.offset_callee % param.size()) % param.size(); + + // floats are only moved to accommodate integer movement in the stack + let stack_has_moved = self.integral_params >= Self::INTEGRAL_REGISTERS; + if stack_has_moved { + let s = &mut self.assmblr; + let ot = self.offset_trampoline; + let oc = self.offset_callee; + match param { + Single => aarch64!(s + // 6.1.2 Aarch64 PCS: + // > Registers v8-v15 must be preserved by a callee across subroutine calls; + // > the remaining registers (v0-v7, v16-v31) do not need to be preserved (or should be preserved by the caller). + ; ldr s16, [sp, ot + padding_trampl] + ; str s16, [sp, oc + padding_callee] + ), + Double => aarch64!(s + ; ldr d16, [sp, ot + padding_trampl] + ; str d16, [sp, oc + padding_callee] + ), + } + } + self.offset_trampoline += padding_trampl + param.size(); + self.offset_callee += padding_callee + param.size(); + + debug_assert!( + self.allocated_stack == 0 || self.offset_callee <= self.allocated_stack + ); + } + self.float_params += 1; + } + + fn move_integral(&mut self, param: Integral) { + let s = &mut self.assmblr; + // Section 6.4.2 of the Aarch64 PCS: + // If the argument is an Integral or Pointer Type, the size of the argument is less than or + // equal to 8 bytes and the NGRN is less than 8, the argument is copied to the least + // significant bits in x[NGRN]. The NGRN is incremented by one. The argument has now been + // allocated. + // [if NGRN is equal or more than 8] + // The argument is copied to memory at the adjusted NSAA. The NSAA is incremented by the size + // of the argument. The argument has now been allocated. + let param_i = self.integral_params; + + // move each argument one position to the left. The first argument in the stack moves to the last integer register (x7). + match (param_i, param) { + // From https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > The caller of a function is responsible for signing or zero-extending any argument with fewer than 32 bits. + // > The standard ABI expects the callee to sign or zero-extend those arguments. + // (this applies to register parameters, as stack parameters are not eightbyte aligned in Apple) + (0, I(B)) => aarch64!(s; sxtb w0, w1), + (0, U(B)) => aarch64!(s; and w0, w1, 0xFF), + (0, I(W)) => aarch64!(s; sxth w0, w1), + (0, U(W)) => aarch64!(s; and w0, w1, 0xFFFF), + (0, I(DW) | U(DW)) => aarch64!(s; mov w0, w1), + (0, I(QW) | U(QW)) => aarch64!(s; mov x0, x1), + // The fast API expects buffer arguments passed as a pointer to a FastApiTypedArray struct + // Here we blindly follow the layout of https://github.com/denoland/rusty_v8/blob/main/src/fast_api.rs#L190-L200 + // although that might be problematic: https://discord.com/channels/684898665143206084/956626010248478720/1009450940866252823 + (0, Buffer) => aarch64!(s; ldr x0, [x1, 8]), + + (1, I(B)) => aarch64!(s; sxtb w1, w2), + (1, U(B)) => aarch64!(s; and w1, w2, 0xFF), + (1, I(W)) => aarch64!(s; sxth w1, w2), + (1, U(W)) => aarch64!(s; and w1, w2, 0xFFFF), + (1, I(DW) | U(DW)) => aarch64!(s; mov w1, w2), + (1, I(QW) | U(QW)) => aarch64!(s; mov x1, x2), + (1, Buffer) => aarch64!(s; ldr x1, [x2, 8]), + + (2, I(B)) => aarch64!(s; sxtb w2, w3), + (2, U(B)) => aarch64!(s; and w2, w3, 0xFF), + (2, I(W)) => aarch64!(s; sxth w2, w3), + (2, U(W)) => aarch64!(s; and w2, w3, 0xFFFF), + (2, I(DW) | U(DW)) => aarch64!(s; mov w2, w3), + (2, I(QW) | U(QW)) => aarch64!(s; mov x2, x3), + (2, Buffer) => aarch64!(s; ldr x2, [x3, 8]), + + (3, I(B)) => aarch64!(s; sxtb w3, w4), + (3, U(B)) => aarch64!(s; and w3, w4, 0xFF), + (3, I(W)) => aarch64!(s; sxth w3, w4), + (3, U(W)) => aarch64!(s; and w3, w4, 0xFFFF), + (3, I(DW) | U(DW)) => aarch64!(s; mov w3, w4), + (3, I(QW) | U(QW)) => aarch64!(s; mov x3, x4), + (3, Buffer) => aarch64!(s; ldr x3, [x4, 8]), + + (4, I(B)) => aarch64!(s; sxtb w4, w5), + (4, U(B)) => aarch64!(s; and w4, w5, 0xFF), + (4, I(W)) => aarch64!(s; sxth w4, w5), + (4, U(W)) => aarch64!(s; and w4, w5, 0xFFFF), + (4, I(DW) | U(DW)) => aarch64!(s; mov w4, w5), + (4, I(QW) | U(QW)) => aarch64!(s; mov x4, x5), + (4, Buffer) => aarch64!(s; ldr x4, [x5, 8]), + + (5, I(B)) => aarch64!(s; sxtb w5, w6), + (5, U(B)) => aarch64!(s; and w5, w6, 0xFF), + (5, I(W)) => aarch64!(s; sxth w5, w6), + (5, U(W)) => aarch64!(s; and w5, w6, 0xFFFF), + (5, I(DW) | U(DW)) => aarch64!(s; mov w5, w6), + (5, I(QW) | U(QW)) => aarch64!(s; mov x5, x6), + (5, Buffer) => aarch64!(s; ldr x5, [x6, 8]), + + (6, I(B)) => aarch64!(s; sxtb w6, w7), + (6, U(B)) => aarch64!(s; and w6, w7, 0xFF), + (6, I(W)) => aarch64!(s; sxth w6, w7), + (6, U(W)) => aarch64!(s; and w6, w7, 0xFFFF), + (6, I(DW) | U(DW)) => aarch64!(s; mov w6, w7), + (6, I(QW) | U(QW)) => aarch64!(s; mov x6, x7), + (6, Buffer) => aarch64!(s; ldr x6, [x7, 8]), + + (7, param) => { + let ot = self.offset_trampoline; + match param { + I(B) => { + aarch64!(s; ldrsb w7, [sp, ot]) + } + U(B) => { + // ldrb zero-extends the byte to fill the 32bits of the register + aarch64!(s; ldrb w7, [sp, ot]) + } + I(W) => { + aarch64!(s; ldrsh w7, [sp, ot]) + } + U(W) => { + // ldrh zero-extends the half-word to fill the 32bits of the register + aarch64!(s; ldrh w7, [sp, ot]) + } + I(DW) | U(DW) => { + aarch64!(s; ldr w7, [sp, ot]) + } + I(QW) | U(QW) => { + aarch64!(s; ldr x7, [sp, ot]) + } + Buffer => { + aarch64!(s + ; ldr x7, [sp, ot] + ; ldr x7, [x7, 8] + ) + } + } + // 16 and 8 bit integers are 32 bit integers in v8 + self.offset_trampoline += max(param.size(), 4); + } + + (8.., param) => { + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > Function arguments may consume slots on the stack that are not multiples of 8 bytes. + // (i.e. natural alignment instead of eightbyte alignment) + // + // N.B. V8 does not currently follow this Apple's policy, and instead aligns all arguments to 8 Byte boundaries. + // The current implementation follows the V8 incorrect calling convention for the sake of a seamless experience + // for the Deno users. Whenever upgrading V8 we should make sure that the bug has not been amended, and revert this + // workaround once it has been. The bug is being tracked in https://bugs.chromium.org/p/v8/issues/detail?id=13171 + let size_original = param.size(); + // 16 and 8 bit integers are 32 bit integers in v8 + // let size_trampl = max(size_original, 4); // <-- Apple alignment + let size_trampl = 8; // <-- V8 incorrect alignment + let padding_trampl = + padding_to_align(size_trampl, self.offset_trampoline); + let padding_callee = + padding_to_align(size_original, self.offset_callee); + let ot = self.offset_trampoline; + let oc = self.offset_callee; + match param { + I(B) | U(B) => aarch64!(s + ; ldr w8, [sp, ot + padding_trampl] + ; strb w8, [sp, oc + padding_callee] + ), + I(W) | U(W) => aarch64!(s + ; ldr w8, [sp, ot + padding_trampl] + ; strh w8, [sp, oc + padding_callee] + ), + I(DW) | U(DW) => aarch64!(s + ; ldr w8, [sp, ot + padding_trampl] + ; str w8, [sp, oc + padding_callee] + ), + I(QW) | U(QW) => aarch64!(s + ; ldr x8, [sp, ot + padding_trampl] + ; str x8, [sp, oc + padding_callee] + ), + Buffer => aarch64!(s + ; ldr x8, [sp, ot + padding_trampl] + ; ldr x8, [x8, 8] + ; str x8, [sp, oc + padding_callee] + ), + } + self.offset_trampoline += padding_trampl + size_trampl; + self.offset_callee += padding_callee + size_original; + + debug_assert!( + self.allocated_stack == 0 + || self.offset_callee <= self.allocated_stack + ); + } + }; + self.integral_params += 1; + } + + fn zero_first_arg(&mut self) { + debug_assert!( + self.integral_params == 0, + "the trampoline would zero the first argument after having overridden it with the second one" + ); + aarch64!(self.assmblr; mov x0, xzr); + } + + fn save_out_array_to_preserved_register(&mut self) { + let s = &mut self.assmblr; + // functions returning 64 bit integers have the out array appended as their last parameter, + // and it is a *FastApiTypedArray + match self.integral_params { + // x0 is always V8's receiver + 0 => aarch64!(s; ldr x19, [x1, 8]), + 1 => aarch64!(s; ldr x19, [x2, 8]), + 2 => aarch64!(s; ldr x19, [x3, 8]), + 3 => aarch64!(s; ldr x19, [x4, 8]), + 4 => aarch64!(s; ldr x19, [x5, 8]), + 5 => aarch64!(s; ldr x19, [x6, 8]), + 6 => aarch64!(s; ldr x19, [x7, 8]), + 7.. => { + aarch64!(s + ; ldr x19, [sp, self.offset_trampoline] + ; ldr x19, [x19, 8] + ) + } + } + } + + fn wrap_return_value_in_out_array(&mut self) { + aarch64!(self.assmblr; str x0, [x19]); + } + + fn save_frame_record(&mut self) { + debug_assert!( + self.allocated_stack >= 16, + "the trampoline would try to save the frame record to the stack without having allocated enough space for it" + ); + aarch64!(self.assmblr + // Frame record is stored at the bottom of the stack frame + ; stp x29, x30, [sp, self.allocated_stack - 16] + ; add x29, sp, self.allocated_stack - 16 + ) + } + + fn recover_frame_record(&mut self) { + // The stack cannot have been deallocated before the frame record is restored + debug_assert!( + self.allocated_stack >= 16, + "the trampoline would try to load the frame record from the stack, but it couldn't possibly contain it" + ); + // Frame record is stored at the bottom of the stack frame + aarch64!(self.assmblr; ldp x29, x30, [sp, self.allocated_stack - 16]) + } + + fn save_preserved_register_to_stack(&mut self) { + // If a preserved register needs to be used, we must have allocated at least 32 bytes in the stack + // 16 for the frame record, 8 for the preserved register, and 8 for 16-byte alignment. + debug_assert!( + self.allocated_stack >= 32, + "the trampoline would try to save a register to the stack without having allocated enough space for it" + ); + // preserved register is stored after frame record + aarch64!(self.assmblr; str x19, [sp, self.allocated_stack - 24]); + } + + fn recover_preserved_register(&mut self) { + // The stack cannot have been deallocated before the preserved register is restored + // 16 for the frame record, 8 for the preserved register, and 8 for 16-byte alignment. + debug_assert!( + self.allocated_stack >= 32, + "the trampoline would try to recover the value of a register from the stack, but it couldn't possibly contain it" + ); + // preserved register is stored after frame record + aarch64!(self.assmblr; ldr x19, [sp, self.allocated_stack - 24]); + } + + fn allocate_stack(&mut self, symbol: &Symbol) { + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > Function arguments may consume slots on the stack that are not multiples of 8 bytes. + // (i.e. natural alignment instead of eightbyte alignment) + let mut int_params = 0u32; + let mut float_params = 0u32; + let mut stack_size = 0u32; + for param in symbol.parameter_types.iter().copied() { + match param.into() { + Float(float_param) => { + float_params += 1; + if float_params > Self::FLOAT_REGISTERS { + stack_size += float_param.size(); + } + } + Int(integral_param) => { + int_params += 1; + if int_params > Self::INTEGRAL_REGISTERS { + stack_size += integral_param.size(); + } + } + } + } + + // Section 6.2.3 of the Aarch64 PCS: + // > Each frame shall link to the frame of its caller by means of a frame record of two 64-bit values on the stack + stack_size += 16; + + if self.must_save_preserved_register_to_stack(symbol) { + stack_size += 8; + } + + // Section 6.2.2 of Aarch64 PCS: + // > At any point at which memory is accessed via SP, the hardware requires that + // > - SP mod 16 = 0. The stack must be quad-word aligned. + // > The stack must also conform to the following constraint at a public interface: + // > - SP mod 16 = 0. The stack must be quad-word aligned. + stack_size += padding_to_align(16, stack_size); + + if stack_size > 0 { + aarch64!(self.assmblr; sub sp, sp, stack_size); + self.offset_trampoline += stack_size; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.allocated_stack += stack_size; + } + } + + fn deallocate_stack(&mut self) { + if self.allocated_stack > 0 { + aarch64!(self.assmblr; add sp, sp, self.allocated_stack); + self.allocated_stack = 0; + } + } + + fn call(&mut self, ptr: *const c_void) { + // the stack has been aligned during stack allocation + // Frame record has been stored in stack and frame pointer points to it + debug_assert!( + self.allocated_stack % 16 == 0, + "the trampoline would call the FFI function with an unaligned stack" + ); + debug_assert!( + self.allocated_stack >= 16, + "the trampoline would call the FFI function without allocating enough stack for the frame record" + ); + self.load_callee_address(ptr); + aarch64!(self.assmblr; blr x8); + } + + fn tailcall(&mut self, ptr: *const c_void) { + // stack pointer is never modified and remains aligned + // frame pointer and link register remain the one provided by the trampoline's caller (V8) + debug_assert!( + self.allocated_stack == 0, + "the trampoline would tail call the FFI function with an outstanding stack allocation" + ); + self.load_callee_address(ptr); + aarch64!(self.assmblr; br x8); + } + + fn ret(&mut self) { + debug_assert!( + self.allocated_stack == 0, + "the trampoline would return with an outstanding stack allocation" + ); + aarch64!(self.assmblr; ret); + } + + fn load_callee_address(&mut self, ptr: *const c_void) { + // Like all ARM instructions, move instructions are 32bit long and can fit at most 16bit immediates. + // bigger immediates are loaded in multiple steps applying a left-shift modifier + let mut address = ptr as u64; + let mut imm16 = address & 0xFFFF; + aarch64!(self.assmblr; movz x8, imm16 as u32); + address >>= 16; + let mut shift = 16; + while address > 0 { + imm16 = address & 0xFFFF; + if imm16 != 0 { + aarch64!(self.assmblr; movk x8, imm16 as u32, lsl shift); + } + address >>= 16; + shift += 16; + } + } + + fn is_recv_arg_overridden(&self) -> bool { + // V8 receiver is the first parameter of the trampoline function and is a pointer + self.integral_params > 0 + } + + fn must_save_preserved_register_to_stack(&mut self, symbol: &Symbol) -> bool { + self.must_wrap_return_value_in_typed_array(symbol.result_type) + } + + fn must_wrap_return_value_in_typed_array(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 64 bit integers by wrapping them in a TypedArray out parameter + crate::needs_unwrap(rv) + } + + fn finalize(self) -> ExecutableBuffer { + self.assmblr.finalize().unwrap() + } +} + +struct Win64 { + // Reference: https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/build/x64-calling-convention.md + assmblr: dynasmrt::x64::Assembler, + // Params counter (Windows does not distinguish by type with regards to parameter position) + params: u32, + // Stack offset accumulators + offset_trampoline: u32, + offset_callee: u32, + allocated_stack: u32, + frame_pointer: u32, +} + +#[cfg_attr( + not(all(target_aarch = "x86_64", target_family = "windows")), + allow(dead_code) +)] +impl Win64 { + // Section "Parameter Passing" of the Windows x64 calling convention: + // > By default, the x64 calling convention passes the first four arguments to a function in registers. + const REGISTERS: u32 = 4; + + fn new() -> Self { + Self { + assmblr: dynasmrt::x64::Assembler::new().unwrap(), + params: 0, + // trampoline caller's return address + trampoline's shadow space + offset_trampoline: 8 + 32, + offset_callee: 8 + 32, + allocated_stack: 0, + frame_pointer: 0, + } + } + + fn compile(sym: &Symbol) -> Trampoline { + let mut compiler = Self::new(); + + let must_cast_return_value = + compiler.must_cast_return_value(sym.result_type); + let must_wrap_return_value = + compiler.must_wrap_return_value_in_typed_array(sym.result_type); + let must_save_preserved_register = must_wrap_return_value; + let cannot_tailcall = must_cast_return_value || must_wrap_return_value; + + if cannot_tailcall { + if must_save_preserved_register { + compiler.save_preserved_register_to_stack(); + } + compiler.allocate_stack(&sym.parameter_types); + } + + for param in sym.parameter_types.iter().copied() { + compiler.move_left(param) + } + if !compiler.is_recv_arg_overridden() { + // the receiver object should never be expected. Avoid its unexpected or deliberate leak + compiler.zero_first_arg(); + } + if must_wrap_return_value { + compiler.save_out_array_to_preserved_register(); + } + + if cannot_tailcall { + compiler.call(sym.ptr.as_ptr()); + if must_cast_return_value { + compiler.cast_return_value(sym.result_type); + } + if must_wrap_return_value { + compiler.wrap_return_value_in_out_array(); + } + compiler.deallocate_stack(); + if must_save_preserved_register { + compiler.recover_preserved_register(); + } + compiler.ret(); + } else { + compiler.tailcall(sym.ptr.as_ptr()); + } + + Trampoline(compiler.finalize()) + } + + fn move_left(&mut self, param: NativeType) { + // Section "Parameter Passing" of the Windows x64 calling convention: + // > By default, the x64 calling convention passes the first four arguments to a function in registers. + // > The registers used for these arguments depend on the position and type of the argument. + // > Remaining arguments get pushed on the stack in right-to-left order. + // > [...] + // > Integer valued arguments in the leftmost four positions are passed in left-to-right order in RCX, RDX, R8, and R9 + // > [...] + // > Any floating-point and double-precision arguments in the first four parameters are passed in XMM0 - XMM3, depending on position + let s = &mut self.assmblr; + let param_i = self.params; + + // move each argument one position to the left. The first argument in the stack moves to the last register (r9 or xmm3). + // If the FFI function is called with a new stack frame, the arguments remaining in the stack are copied to the new stack frame. + // Otherwise, they are copied 8 bytes lower in the same frame + match (param_i, param.into()) { + // Section "Parameter Passing" of the Windows x64 calling convention: + // > All integer arguments in registers are right-justified, so the callee can ignore the upper bits of the register + // > and access only the portion of the register necessary. + // (i.e. unlike in SysV or Aarch64-Apple, 8/16 bit integers are not expected to be zero/sign extended) + (0, Int(U(B | W | DW) | I(B | W | DW))) => x64!(s; mov ecx, edx), + (0, Int(U(QW) | I(QW))) => x64!(s; mov rcx, rdx), + // The fast API expects buffer arguments passed as a pointer to a FastApiTypedArray struct + // Here we blindly follow the layout of https://github.com/denoland/rusty_v8/blob/main/src/fast_api.rs#L190-L200 + // although that might be problematic: https://discord.com/channels/684898665143206084/956626010248478720/1009450940866252823 + (0, Int(Buffer)) => x64!(s; mov rcx, [rdx + 8]), + // Use movaps for singles and doubles, benefits of smaller encoding outweigh those of using the correct instruction for the type, + // which for doubles should technically be movapd + (0, Float(_)) => { + x64!(s; movaps xmm0, xmm1); + self.zero_first_arg(); + } + + (1, Int(U(B | W | DW) | I(B | W | DW))) => x64!(s; mov edx, r8d), + (1, Int(U(QW) | I(QW))) => x64!(s; mov rdx, r8), + (1, Int(Buffer)) => x64!(s; mov rdx, [r8 + 8]), + (1, Float(_)) => x64!(s; movaps xmm1, xmm2), + + (2, Int(U(B | W | DW) | I(B | W | DW))) => x64!(s; mov r8d, r9d), + (2, Int(U(QW) | I(QW))) => x64!(s; mov r8, r9), + (2, Int(Buffer)) => x64!(s; mov r8, [r9 + 8]), + (2, Float(_)) => x64!(s; movaps xmm2, xmm3), + + (3, param) => { + let ot = self.offset_trampoline as i32; + match param { + Int(U(B | W | DW) | I(B | W | DW)) => { + x64!(s; mov r9d, [rsp + ot]) + } + Int(U(QW) | I(QW)) => { + x64!(s; mov r9, [rsp + ot]) + } + Int(Buffer) => { + x64!(s + ; mov r9, [rsp + ot] + ; mov r9, [r9 + 8]) + } + Float(_) => { + // parameter 4 is always 16-byte aligned, so we can use movaps instead of movups + x64!(s; movaps xmm3, [rsp + ot]) + } + } + // Section "x64 Aggregate and Union layout" of the windows x64 software conventions doc: + // > The alignment of the beginning of a structure or a union is the maximum alignment of any individual member + // Ref: https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/build/x64-software-conventions.md#x64-aggregate-and-union-layout + self.offset_trampoline += 8; + } + (4.., param) => { + let ot = self.offset_trampoline as i32; + let oc = self.offset_callee as i32; + match param { + Int(U(B | W | DW) | I(B | W | DW)) => { + x64!(s + ; mov eax, [rsp + ot] + ; mov [rsp + oc], eax + ) + } + Int(U(QW) | I(QW)) => { + x64!(s + ; mov rax, [rsp + ot] + ; mov [rsp + oc], rax + ) + } + Int(Buffer) => { + x64!(s + ; mov rax, [rsp + ot] + ; mov rax, [rax + 8] + ; mov [rsp + oc], rax + ) + } + Float(_) => { + x64!(s + ; movups xmm4, [rsp + ot] + ; movups [rsp + oc], xmm4 + ) + } + } + // Section "x64 Aggregate and Union layout" of the windows x64 software conventions doc: + // > The alignment of the beginning of a structure or a union is the maximum alignment of any individual member + // Ref: https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/build/x64-software-conventions.md#x64-aggregate-and-union-layout + self.offset_trampoline += 8; + self.offset_callee += 8; + + debug_assert!( + self.allocated_stack == 0 + || self.offset_callee <= self.allocated_stack + ); + } + } + self.params += 1; + } + + fn zero_first_arg(&mut self) { + debug_assert!( + self.params == 0, + "the trampoline would zero the first argument after having overridden it with the second one" + ); + x64!(self.assmblr; xor ecx, ecx); + } + + fn cast_return_value(&mut self, rv: NativeType) { + let s = &mut self.assmblr; + // V8 only supports 32bit integers. We support 8 and 16 bit integers casting them to 32bits. + // Section "Return Values" of the Windows x64 Calling Convention doc: + // > The state of unused bits in the value returned in RAX or XMM0 is undefined. + match rv { + NativeType::U8 => x64!(s; movzx eax, al), + NativeType::I8 => x64!(s; movsx eax, al), + NativeType::U16 => x64!(s; movzx eax, ax), + NativeType::I16 => x64!(s; movsx eax, ax), + _ => (), + } + } + + fn save_out_array_to_preserved_register(&mut self) { + let s = &mut self.assmblr; + // functions returning 64 bit integers have the out array appended as their last parameter, + // and it is a *FastApiTypedArray + match self.params { + // rcx is always V8 receiver + 0 => x64!(s; mov rbx, [rdx + 8]), + 1 => x64!(s; mov rbx, [r8 + 8]), + 2 => x64!(s; mov rbx, [r9 + 8]), + 3.. => { + x64!(s + ; mov rax, [rsp + self.offset_trampoline as i32] + ; mov rbx, [rax + 8] + ) + } + } + } + + fn wrap_return_value_in_out_array(&mut self) { + x64!(self.assmblr; mov [rbx], rax) + } + + fn save_preserved_register_to_stack(&mut self) { + x64!(self.assmblr; push rbx); + self.offset_trampoline += 8; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.frame_pointer += 8; + } + + fn recover_preserved_register(&mut self) { + debug_assert!( + self.frame_pointer >= 8, + "the trampoline would try to pop from the stack beyond its frame pointer" + ); + x64!(self.assmblr; pop rbx); + self.frame_pointer -= 8; + // parameter offsets are invalid once this method is called + } + + fn allocate_stack(&mut self, params: &[NativeType]) { + let mut stack_size = 0; + // Section "Calling Convention Defaults" of the x64-calling-convention and Section "Stack Allocation" of the stack-usage docs: + // > The x64 Application Binary Interface (ABI) uses a four-register fast-call calling convention by default. + // > Space is allocated on the call stack as a shadow store for callees to save those registers. + // > [...] + // > Any parameters beyond the first four must be stored on the stack after the shadow store before the call + // > [...] + // > Even if the called function has fewer than 4 parameters, these 4 stack locations are effectively owned by the called function, + // > and may be used by the called function for other purposes besides saving parameter register values + stack_size += max(params.len() as u32, 4) * 8; + + // Align new stack frame (accounting for the 8 byte of the trampoline caller's return address + // and any other potential addition to the stack prior to this allocation) + // Section "Stack Allocation" of stack-usage docs: + // > The stack will always be maintained 16-byte aligned, except within the prolog (for example, after the return address is pushed) + stack_size += padding_to_align(16, self.frame_pointer + stack_size + 8); + + x64!(self.assmblr; sub rsp, stack_size as i32); + self.offset_trampoline += stack_size; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack right after the shadow space + self.offset_callee = 32; + self.allocated_stack += stack_size; + self.frame_pointer += stack_size; + } + + fn deallocate_stack(&mut self) { + debug_assert!( + self.frame_pointer >= self.allocated_stack, + "the trampoline would try to deallocate stack beyond its frame pointer" + ); + x64!(self.assmblr; add rsp, self.allocated_stack as i32); + self.frame_pointer -= self.allocated_stack; + self.allocated_stack = 0; + } + + fn call(&mut self, ptr: *const c_void) { + // the stack has been aligned during stack allocation and/or pushing of preserved registers + debug_assert!( + (8 + self.frame_pointer) % 16 == 0, + "the trampoline would call the FFI function with an unaligned stack" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; call rax + ); + } + + fn tailcall(&mut self, ptr: *const c_void) { + // stack pointer is never modified and remains aligned + // return address remains the one provided by the trampoline's caller (V8) + debug_assert!( + self.allocated_stack == 0, + "the trampoline would tail call the FFI function with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would tail call the FFI function with outstanding locals in the frame" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; jmp rax + ); + } + + fn ret(&mut self) { + debug_assert!( + self.allocated_stack == 0, + "the trampoline would return with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would return with outstanding locals in the frame" + ); + x64!(self.assmblr; ret); + } + + fn is_recv_arg_overridden(&self) -> bool { + self.params > 0 + } + + fn must_cast_return_value(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 8 and 16 bit integers by extending them to 32 bits in the trampoline before returning + matches!( + rv, + NativeType::U8 | NativeType::I8 | NativeType::U16 | NativeType::I16 + ) + } + + fn must_wrap_return_value_in_typed_array(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 64 bit integers by wrapping them in a TypedArray out parameter + crate::needs_unwrap(rv) + } + + fn finalize(self) -> ExecutableBuffer { + self.assmblr.finalize().unwrap() + } +} + +fn padding_to_align(alignment: u32, size: u32) -> u32 { + (alignment - size % alignment) % alignment +} + +#[derive(Clone, Copy, Debug)] +enum Floating { + Single = 4, + Double = 8, +} + +impl Floating { + fn size(self) -> u32 { + self as u32 + } +} + +use Floating::*; + +#[derive(Clone, Copy, Debug)] +enum Integral { + I(Size), + U(Size), + Buffer, +} + +impl Integral { + fn size(self) -> u32 { + match self { + I(size) | U(size) => size as u32, + Buffer => 8, + } + } +} + +use Integral::*; + +#[derive(Clone, Copy, Debug)] +enum Size { + B = 1, + W = 2, + DW = 4, + QW = 8, +} +use Size::*; + +#[allow(clippy::enum_variant_names)] +#[derive(Clone, Copy, Debug)] +enum Param { + Int(Integral), + Float(Floating), +} + +use Param::*; + +impl From for Param { + fn from(native: NativeType) -> Self { + match native { + NativeType::F32 => Float(Single), + NativeType::F64 => Float(Double), + NativeType::Bool | NativeType::U8 => Int(U(B)), + NativeType::U16 => Int(U(W)), + NativeType::U32 | NativeType::Void => Int(U(DW)), + NativeType::U64 + | NativeType::USize + | NativeType::Pointer + | NativeType::Function => Int(U(QW)), + NativeType::I8 => Int(I(B)), + NativeType::I16 => Int(I(W)), + NativeType::I32 => Int(I(DW)), + NativeType::I64 | NativeType::ISize => Int(I(QW)), + NativeType::Buffer => Int(Buffer), + } + } +} + +#[cfg(test)] +mod tests { + use std::ptr::null_mut; + + use libffi::middle::Type; + + use crate::NativeType; + use crate::Symbol; + + fn symbol(parameters: Vec, ret: NativeType) -> Symbol { + Symbol { + cif: libffi::middle::Cif::new(vec![], Type::void()), + ptr: libffi::middle::CodePtr(null_mut()), + parameter_types: parameters, + result_type: ret, + can_callback: false, + } + } + + mod sysv_amd64 { + use std::ops::Deref; + + use dynasmrt::dynasm; + use dynasmrt::DynasmApi; + + use super::super::SysVAmd64; + use super::symbol; + use crate::NativeType::*; + + #[test] + fn tailcall() { + let trampoline = SysVAmd64::compile(&symbol( + vec![ + U8, U16, I16, I8, U32, U64, Buffer, Function, I64, I32, I16, I8, F32, + F32, F32, F32, F64, F64, F64, F64, F32, F64, + ], + Void, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/KE9x1h9xq + dynasm!(assembler + ; .arch x64 + ; movzx edi, sil // u8 + ; movzx esi, dx // u16 + ; movsx edx, cx // i16 + ; movsx ecx, r8b // i8 + ; mov r8d, r9d // u32 + ; mov r9, [DWORD rsp + 8] // u64 + ; mov rax, [DWORD rsp + 16] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 8], rax // .. + ; mov rax, [DWORD rsp + 24] // Function + ; mov [DWORD rsp + 16], rax // .. + ; mov rax, [DWORD rsp + 32] // i64 + ; mov [DWORD rsp + 24], rax // .. + ; mov eax, [DWORD rsp + 40] // i32 + ; mov [DWORD rsp + 32], eax // .. + ; movsx eax, WORD [DWORD rsp + 48] // i16 + ; mov [DWORD rsp + 40], eax // .. + ; movsx eax, BYTE [DWORD rsp + 56] // i8 + ; mov [DWORD rsp + 48], eax // .. + ; movss xmm8, [DWORD rsp + 64] // f32 + ; movss [DWORD rsp + 56], xmm8 // .. + ; movsd xmm8, [DWORD rsp + 72] // f64 + ; movsd [DWORD rsp + 64], xmm8 // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn integer_casting() { + let trampoline = SysVAmd64::compile(&symbol( + vec![U8, U16, I8, I16, U8, U16, I8, I16, U8, U16, I8, I16], + I8, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/qo59bPsfv + dynasm!(assembler + ; .arch x64 + ; sub rsp, DWORD 56 // stack allocation + ; movzx edi, sil // u8 + ; movzx esi, dx // u16 + ; movsx edx, cl // i8 + ; movsx ecx, r8w // i16 + ; movzx r8d, r9b // u8 + ; movzx r9d, WORD [DWORD rsp + 64] // u16 + ; movsx eax, BYTE [DWORD rsp + 72] // i8 + ; mov [DWORD rsp + 0], eax // .. + ; movsx eax, WORD [DWORD rsp + 80] // i16 + ; mov [DWORD rsp + 8], eax // .. + ; movzx eax, BYTE [DWORD rsp + 88] // u8 + ; mov [DWORD rsp + 16], eax // .. + ; movzx eax, WORD [DWORD rsp + 96] // u16 + ; mov [DWORD rsp + 24], eax // .. + ; movsx eax, BYTE [DWORD rsp + 104] // i8 + ; mov [DWORD rsp + 32], eax // .. + ; movsx eax, WORD [DWORD rsp + 112] // i16 + ; mov [DWORD rsp + 40], eax // .. + ; mov rax, QWORD 0 + ; call rax + ; movsx eax, al // return value cast + ; add rsp, DWORD 56 // stack deallocation + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn buffer_parameters() { + let trampoline = SysVAmd64::compile(&symbol( + vec![ + Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, + ], + Void, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/hqv63M3Ko + dynasm!(assembler + ; .arch x64 + ; mov rdi, [rsi + 8] // Buffer + ; mov rsi, [rdx + 8] // Buffer + ; mov rdx, [rcx + 8] // Buffer + ; mov rcx, [r8 + 8] // Buffer + ; mov r8, [r9 + 8] // Buffer + ; mov r9, [DWORD rsp + 8] // Buffer + ; mov r9, [r9 + 8] // .. + ; mov rax, [DWORD rsp + 16] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 8], rax // .. + ; mov rax, [DWORD rsp + 24] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 16], rax // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_register_typed_array() { + let trampoline = SysVAmd64::compile(&symbol(vec![], U64)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/8G7a488o7 + dynasm!(assembler + ; .arch x64 + ; push rbx + ; xor edi, edi // recv + ; mov rbx, [rsi + 8] // save data array pointer to non-volatile register + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_stack_typed_array() { + let trampoline = SysVAmd64::compile(&symbol( + vec![U64, U64, U64, U64, U64, U64, U64], + U64, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/cPnPYWdWq + dynasm!(assembler + ; .arch x64 + ; push rbx + ; sub rsp, DWORD 16 + ; mov rdi, rsi // u64 + ; mov rsi, rdx // u64 + ; mov rdx, rcx // u64 + ; mov rcx, r8 // u64 + ; mov r8, r9 // u64 + ; mov r9, [DWORD rsp + 32] // u64 + ; mov rax, [DWORD rsp + 40] // u64 + ; mov [DWORD rsp + 0], rax // .. + ; mov rax, [DWORD rsp + 48] // save data array pointer to non-volatile register + ; mov rbx, [rax + 8] // .. + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; add rsp, DWORD 16 + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + } + + mod aarch64_apple { + use std::ops::Deref; + + use dynasmrt::dynasm; + + use super::super::Aarch64Apple; + use super::symbol; + use crate::NativeType::*; + + #[test] + fn tailcall() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![ + U8, U16, I16, I8, U32, U64, Buffer, Function, I64, I32, I16, I8, F32, + F32, F32, F32, F64, F64, F64, F64, F32, F64, + ], + Void, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/oefqYWT13 + dynasm!(assembler + ; .arch aarch64 + ; and w0, w1, 0xFF // u8 + ; and w1, w2, 0xFFFF // u16 + ; sxth w2, w3 // i16 + ; sxtb w3, w4 // i8 + ; mov w4, w5 // u32 + ; mov x5, x6 // u64 + ; ldr x6, [x7, 8] // Buffer + ; ldr x7, [sp] // Function + ; ldr x8, [sp, 8] // i64 + ; str x8, [sp] // .. + ; ldr w8, [sp, 16] // i32 + ; str w8, [sp, 8] // .. + ; ldr w8, [sp, 24] // i16 + ; strh w8, [sp, 12] // .. + ; ldr w8, [sp, 32] // i8 + ; strb w8, [sp, 14] // .. + ; ldr s16, [sp, 40] // f32 + ; str s16, [sp, 16] // .. + ; ldr d16, [sp, 48] // f64 + ; str d16, [sp, 24] // .. + ; movz x8, 0 + ; br x8 + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn integer_casting() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![U8, U16, I8, I16, U8, U16, I8, I16, U8, U16, I8, I16], + I8, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/7qfzbzobM + dynasm!(assembler + ; .arch aarch64 + ; and w0, w1, 0xFF // u8 + ; and w1, w2, 0xFFFF // u16 + ; sxtb w2, w3 // i8 + ; sxth w3, w4 // i16 + ; and w4, w5, 0xFF // u8 + ; and w5, w6, 0xFFFF // u16 + ; sxtb w6, w7 // i8 + ; ldrsh w7, [sp] // i16 + ; ldr w8, [sp, 8] // u8 + ; strb w8, [sp] // .. + ; ldr w8, [sp, 16] // u16 + ; strh w8, [sp, 2] // .. + ; ldr w8, [sp, 24] // i8 + ; strb w8, [sp, 4] // .. + ; ldr w8, [sp, 32] // i16 + ; strh w8, [sp, 6] // .. + ; movz x8, 0 + ; br x8 + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn buffer_parameters() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![ + Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, + Buffer, Buffer, + ], + Void, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/obd6z6vsf + dynasm!(assembler + ; .arch aarch64 + ; ldr x0, [x1, 8] // Buffer + ; ldr x1, [x2, 8] // Buffer + ; ldr x2, [x3, 8] // Buffer + ; ldr x3, [x4, 8] // Buffer + ; ldr x4, [x5, 8] // Buffer + ; ldr x5, [x6, 8] // Buffer + ; ldr x6, [x7, 8] // Buffer + ; ldr x7, [sp] // Buffer + ; ldr x7, [x7, 8] // .. + ; ldr x8, [sp, 8] // Buffer + ; ldr x8, [x8, 8] // .. + ; str x8, [sp] // .. + ; ldr x8, [sp, 16] // Buffer + ; ldr x8, [x8, 8] // .. + ; str x8, [sp, 8] // .. + ; movz x8, 0 + ; br x8 + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_register_typed_array() { + let trampoline = Aarch64Apple::compile(&symbol(vec![], U64)); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/47EvvYb83 + dynasm!(assembler + ; .arch aarch64 + ; sub sp, sp, 32 + ; stp x29, x30, [sp, 16] + ; add x29, sp, 16 + ; str x19, [sp, 8] + ; mov x0, xzr // recv + ; ldr x19, [x1, 8] // save data array pointer to non-volatile register + ; movz x8, 0 + ; blr x8 + ; str x0, [x19] // copy return value to data pointer address + ; ldr x19, [sp, 8] + ; ldp x29, x30, [sp, 16] + ; add sp, sp, 32 + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_stack_typed_array() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![U64, U64, U64, U64, U64, U64, U64, U64, U8, U8], + U64, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/PvYPbsE1b + dynasm!(assembler + ; .arch aarch64 + ; sub sp, sp, 32 + ; stp x29, x30, [sp, 16] + ; add x29, sp, 16 + ; str x19, [sp, 8] + ; mov x0, x1 // u64 + ; mov x1, x2 // u64 + ; mov x2, x3 // u64 + ; mov x3, x4 // u64 + ; mov x4, x5 // u64 + ; mov x5, x6 // u64 + ; mov x6, x7 // u64 + ; ldr x7, [sp, 32] // u64 + ; ldr w8, [sp, 40] // u8 + ; strb w8, [sp] // .. + ; ldr w8, [sp, 48] // u8 + ; strb w8, [sp, 1] // .. + ; ldr x19, [sp, 56] // save data array pointer to non-volatile register + ; ldr x19, [x19, 8] // .. + ; movz x8, 0 + ; blr x8 + ; str x0, [x19] // copy return value to data pointer address + ; ldr x19, [sp, 8] + ; ldp x29, x30, [sp, 16] + ; add sp, sp, 32 + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + } + + mod x64_windows { + use std::ops::Deref; + + use dynasmrt::{dynasm, DynasmApi}; + + use super::super::Win64; + use super::symbol; + use crate::NativeType::*; + + #[test] + fn tailcall() { + let trampoline = + Win64::compile(&symbol(vec![U8, I16, F64, F32, U32, I8, Buffer], Void)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/TYzqrf9aj + dynasm!(assembler + ; .arch x64 + ; mov ecx, edx // u8 + ; mov edx, r8d // i16 + ; movaps xmm2, xmm3 // f64 + ; movaps xmm3, [DWORD rsp + 40] // f32 + ; mov eax, [DWORD rsp + 48] // u32 + ; mov [DWORD rsp + 40], eax // .. + ; mov eax, [DWORD rsp + 56] // i8 + ; mov [DWORD rsp + 48], eax // .. + ; mov rax, [DWORD rsp + 64] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 56], rax // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn integer_casting() { + let trampoline = Win64::compile(&symbol( + vec![U8, U16, I8, I16, U8, U16, I8, I16, U8, U16, I8, I16], + I8, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/KMx56KGTq + dynasm!(assembler + ; .arch x64 + ; sub rsp, DWORD 104 // stack allocation + ; mov ecx, edx // u8 + ; mov edx, r8d // u16 + ; mov r8d, r9d // i8 + ; mov r9d, [DWORD rsp + 144] // i16 + ; mov eax, [DWORD rsp + 152] // u8 + ; mov [DWORD rsp + 32], eax // .. + ; mov eax, [DWORD rsp + 160] // u16 + ; mov [DWORD rsp + 40], eax // u16 + ; mov eax, [DWORD rsp + 168] // i8 + ; mov [DWORD rsp + 48], eax // .. + ; mov eax, [DWORD rsp + 176] // i16 + ; mov [DWORD rsp + 56], eax // .. + ; mov eax, [DWORD rsp + 184] // u8 + ; mov [DWORD rsp + 64], eax // .. + ; mov eax, [DWORD rsp + 192] // u16 + ; mov [DWORD rsp + 72], eax // .. + ; mov eax, [DWORD rsp + 200] // i8 + ; mov [DWORD rsp + 80], eax // .. + ; mov eax, [DWORD rsp + 208] // i16 + ; mov [DWORD rsp + 88], eax // .. + ; mov rax, QWORD 0 + ; call rax + ; movsx eax, al // return value cast + ; add rsp, DWORD 104 // stack deallocation + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn buffer_parameters() { + let trampoline = Win64::compile(&symbol( + vec![Buffer, Buffer, Buffer, Buffer, Buffer, Buffer], + Void, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/TYzqrf9aj + dynasm!(assembler + ; .arch x64 + ; mov rcx, [rdx + 8] // Buffer + ; mov rdx, [r8 + 8] // Buffer + ; mov r8, [r9 + 8] // Buffer + ; mov r9, [DWORD rsp + 40] // Buffer + ; mov r9, [r9 + 8] // .. + ; mov rax, [DWORD rsp + 48] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 40], rax // .. + ; mov rax, [DWORD rsp + 56] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 48], rax // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_register_typed_array() { + let trampoline = Win64::compile(&symbol(vec![], U64)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/7EnPE7o3T + dynasm!(assembler + ; .arch x64 + ; push rbx + ; sub rsp, DWORD 32 + ; xor ecx, ecx // recv + ; mov rbx, [rdx + 8] // save data array pointer to non-volatile register + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; add rsp, DWORD 32 + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_stack_typed_array() { + let trampoline = + Win64::compile(&symbol(vec![U64, U64, U64, U64, U64], U64)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/3966sfEex + dynasm!(assembler + ; .arch x64 + ; push rbx + ; sub rsp, DWORD 48 + ; mov rcx, rdx // u64 + ; mov rdx, r8 // u64 + ; mov r8, r9 // u64 + ; mov r9, [DWORD rsp + 96] // u64 + ; mov rax, [DWORD rsp + 104] // u64 + ; mov [DWORD rsp + 32], rax // .. + ; mov rax, [DWORD rsp + 112] // save data array pointer to non-volatile register + ; mov rbx, [rax + 8] // .. + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; add rsp, DWORD 48 + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + } +} diff --git a/ext/ffi/jit_trampoline.rs b/ext/ffi/jit_trampoline.rs deleted file mode 100644 index 6cb8ec74c2..0000000000 --- a/ext/ffi/jit_trampoline.rs +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. - -use crate::NativeType; -use crate::{tcc::Compiler, Symbol}; -use std::ffi::c_void; -use std::ffi::CString; -use std::fmt::Write as _; -use std::mem::size_of; - -const _: () = assert!(size_of::() == size_of::()); - -pub(crate) struct Allocation { - pub addr: *mut c_void, - _ctx: Compiler, - _sym: Box, -} - -macro_rules! cstr { - ($st:expr) => { - &CString::new($st).unwrap() - }; -} - -fn native_arg_to_c(ty: &NativeType) -> &'static str { - match ty { - NativeType::Bool => "bool", - NativeType::U8 | NativeType::U16 | NativeType::U32 => "uint32_t", - NativeType::I8 | NativeType::I16 | NativeType::I32 => "int32_t", - NativeType::Void => "void", - NativeType::F32 => "float", - NativeType::F64 => "double", - NativeType::U64 => "uint64_t", - NativeType::I64 => "int64_t", - NativeType::ISize => "intptr_t", - NativeType::USize => "uintptr_t", - NativeType::Buffer => "struct FastApiTypedArray*", - NativeType::Function | NativeType::Pointer => "void*", - } -} - -fn native_to_c(ty: &NativeType) -> &'static str { - match ty { - NativeType::Bool => "bool", - NativeType::U8 => "uint8_t", - NativeType::U16 => "uint16_t", - NativeType::U32 => "uint32_t", - NativeType::I8 => "int8_t", - NativeType::I16 => "uint16_t", - NativeType::I32 => "int32_t", - NativeType::Void => "void", - NativeType::F32 => "float", - NativeType::F64 => "double", - NativeType::U64 => "uint64_t", - NativeType::I64 => "int64_t", - NativeType::ISize => "intptr_t", - NativeType::USize => "uintptr_t", - NativeType::Pointer | NativeType::Buffer | NativeType::Function => "void*", - } -} - -pub(crate) fn codegen(sym: &crate::Symbol) -> String { - let mut c = String::from(include_str!("prelude.h")); - let needs_unwrap = crate::needs_unwrap(sym.result_type); - - // Return type of the FFI call. - let ffi_ret = native_to_c(&sym.result_type); - // Return type of the trampoline. - let ret = if needs_unwrap { "void" } else { ffi_ret }; - - // extern func( - let _ = write!(c, "\nextern {ffi_ret} func("); - // p0, p1, ...); - for (i, ty) in sym.parameter_types.iter().enumerate() { - if i > 0 { - c += ", "; - } - c += native_to_c(ty); - let _ = write!(c, " p{i}"); - } - c += ");\n\n"; - - // void* recv, p0, p1, ...); - c += ret; - c += " func_trampoline("; - c += "void* recv"; - for (i, ty) in sym.parameter_types.iter().enumerate() { - c += ", "; - c += native_arg_to_c(ty); - let _ = write!(c, " p{i}"); - } - if needs_unwrap { - let _ = write!(c, ", struct FastApiTypedArray* const p_ret"); - } - c += ") {\n"; - // func(p0, p1, ...); - let mut call_s = String::from("func("); - { - for (i, ty) in sym.parameter_types.iter().enumerate() { - if i > 0 { - call_s += ", "; - } - if matches!(ty, NativeType::Buffer) { - let _ = write!(call_s, "p{i}->data"); - } else { - let _ = write!(call_s, "p{i}"); - } - } - call_s += ");\n"; - } - if needs_unwrap { - // r = func(p0, p1, ...); - // ((*)p_ret->data)[0] = r; - let _ = write!(c, " {ffi_ret} r = {call_s}"); - let _ = writeln!(c, " (({ffi_ret}*)p_ret->data)[0] = r;"); - } else { - // return func(p0, p1, ...); - let _ = write!(c, " return {call_s}"); - } - c += "}\n\n"; - c -} - -pub(crate) fn gen_trampoline( - sym: Box, -) -> Result, ()> { - let mut ctx = Compiler::new()?; - ctx.set_options(cstr!("-nostdlib")); - // SAFETY: symbol satisfies ABI requirement. - unsafe { ctx.add_symbol(cstr!("func"), sym.ptr.0 as *const c_void) }; - let c = codegen(&sym); - ctx.compile_string(cstr!(c))?; - let alloc = Allocation { - addr: ctx.relocate_and_get_symbol(cstr!("func_trampoline"))?, - _ctx: ctx, - _sym: sym, - }; - Ok(Box::new(alloc)) -} - -#[cfg(test)] -mod tests { - use super::*; - use libffi::middle::Type; - use std::ptr::null_mut; - - fn codegen(parameters: Vec, ret: NativeType) -> String { - let sym = Box::new(crate::Symbol { - cif: libffi::middle::Cif::new(vec![], Type::void()), - ptr: libffi::middle::CodePtr(null_mut()), - parameter_types: parameters, - result_type: ret, - can_callback: false, - }); - super::codegen(&sym) - } - - const PRELUDE: &str = include_str!("prelude.h"); - fn assert_codegen(expected: String, actual: &str) { - assert_eq!(expected, format!("{PRELUDE}\n{}", actual)) - } - - #[test] - fn test_gen_trampoline() { - assert_codegen( - codegen(vec![], NativeType::Void), - "extern void func();\n\n\ - void func_trampoline(void* recv) {\ - \n return func();\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::U32, NativeType::U32], NativeType::U32), - "extern uint32_t func(uint32_t p0, uint32_t p1);\n\n\ - uint32_t func_trampoline(void* recv, uint32_t p0, uint32_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::I32, NativeType::I32], NativeType::I32), - "extern int32_t func(int32_t p0, int32_t p1);\n\n\ - int32_t func_trampoline(void* recv, int32_t p0, int32_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::F32, NativeType::F32], NativeType::F32), - "extern float func(float p0, float p1);\n\n\ - float func_trampoline(void* recv, float p0, float p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::F64, NativeType::F64], NativeType::F64), - "extern double func(double p0, double p1);\n\n\ - double func_trampoline(void* recv, double p0, double p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Buffer, NativeType::U32], NativeType::U32), - "extern uint32_t func(void* p0, uint32_t p1);\n\n\ - uint32_t func_trampoline(void* recv, struct FastApiTypedArray* p0, uint32_t p1) {\ - \n return func(p0->data, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Buffer, NativeType::Buffer], NativeType::U32), - "extern uint32_t func(void* p0, void* p1);\n\n\ - uint32_t func_trampoline(void* recv, struct FastApiTypedArray* p0, struct FastApiTypedArray* p1) {\ - \n return func(p0->data, p1->data);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![], NativeType::U64), - "extern uint64_t func();\n\n\ - void func_trampoline(void* recv, struct FastApiTypedArray* const p_ret) {\ - \n uint64_t r = func();\ - \n ((uint64_t*)p_ret->data)[0] = r;\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Buffer, NativeType::Buffer], NativeType::U64), - "extern uint64_t func(void* p0, void* p1);\n\n\ - void func_trampoline(void* recv, struct FastApiTypedArray* p0, struct FastApiTypedArray* p1, struct FastApiTypedArray* const p_ret) {\ - \n uint64_t r = func(p0->data, p1->data);\ - \n ((uint64_t*)p_ret->data)[0] = r;\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Pointer, NativeType::Pointer], NativeType::U64), - "extern uint64_t func(void* p0, void* p1);\n\n\ - void func_trampoline(void* recv, void* p0, void* p1, struct FastApiTypedArray* const p_ret) {\ - \n uint64_t r = func(p0, p1);\ - \n ((uint64_t*)p_ret->data)[0] = r;\n\ - }\n\n", - ); - } - - #[test] - fn test_gen_trampoline_implicit_cast() { - assert_codegen( - codegen(vec![NativeType::I8, NativeType::U8], NativeType::I8), - "extern int8_t func(int8_t p0, uint8_t p1);\n\n\ - int8_t func_trampoline(void* recv, int32_t p0, uint32_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::ISize, NativeType::U64], NativeType::Void), - "extern void func(intptr_t p0, uint64_t p1);\n\n\ - void func_trampoline(void* recv, intptr_t p0, uint64_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::USize, NativeType::USize], NativeType::U32), - "extern uint32_t func(uintptr_t p0, uintptr_t p1);\n\n\ - uint32_t func_trampoline(void* recv, uintptr_t p0, uintptr_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - } -} diff --git a/ext/ffi/lib.rs b/ext/ffi/lib.rs index ead9be2839..b93638c886 100644 --- a/ext/ffi/lib.rs +++ b/ext/ffi/lib.rs @@ -14,7 +14,6 @@ use deno_core::serde_json::json; use deno_core::serde_json::Value; use deno_core::serde_v8; use deno_core::v8; -use deno_core::v8::fast_api; use deno_core::Extension; use deno_core::OpState; use deno_core::Resource; @@ -39,15 +38,11 @@ use std::ptr; use std::rc::Rc; use std::sync::mpsc::sync_channel; -#[cfg(not(target_os = "windows"))] -mod jit_trampoline; -#[cfg(not(target_os = "windows"))] -mod tcc; +mod fast_call; #[cfg(not(target_pointer_width = "64"))] compile_error!("platform not supported"); -// Assert assumptions made in `prelude.h` const _: () = { assert!(size_of::() == 1); assert!(size_of::() == 2); @@ -90,8 +85,6 @@ struct Symbol { ptr: libffi::middle::CodePtr, parameter_types: Vec, result_type: NativeType, - // This is dead code only on Windows - #[allow(dead_code)] can_callback: bool, } @@ -729,50 +722,6 @@ where )) } -pub struct FfiFastCallTemplate { - args: Box<[fast_api::Type]>, - ret: fast_api::CType, - symbol_ptr: *const c_void, -} - -impl fast_api::FastFunction for FfiFastCallTemplate { - fn function(&self) -> *const c_void { - self.symbol_ptr - } - - fn args(&self) -> &'static [fast_api::Type] { - Box::leak(self.args.clone()) - } - - fn return_type(&self) -> fast_api::CType { - self.ret - } -} - -impl From<&NativeType> for fast_api::Type { - fn from(native_type: &NativeType) -> Self { - match native_type { - NativeType::Bool => fast_api::Type::Bool, - NativeType::U8 | NativeType::U16 | NativeType::U32 => { - fast_api::Type::Uint32 - } - NativeType::I8 | NativeType::I16 | NativeType::I32 => { - fast_api::Type::Int32 - } - NativeType::F32 => fast_api::Type::Float32, - NativeType::F64 => fast_api::Type::Float64, - NativeType::Void => fast_api::Type::Void, - NativeType::I64 => fast_api::Type::Int64, - NativeType::U64 => fast_api::Type::Uint64, - NativeType::ISize => fast_api::Type::Int64, - NativeType::USize | NativeType::Pointer | NativeType::Function => { - fast_api::Type::Uint64 - } - NativeType::Buffer => fast_api::Type::TypedArray(fast_api::CType::Uint8), - } - } -} - fn needs_unwrap(rv: NativeType) -> bool { matches!( rv, @@ -796,42 +745,6 @@ fn make_sync_fn<'s>( scope: &mut v8::HandleScope<'s>, sym: Box, ) -> v8::Local<'s, v8::Function> { - #[cfg(not(target_os = "windows"))] - let mut fast_ffi_templ: Option = None; - - #[cfg(target_os = "windows")] - let fast_ffi_templ: Option = None; - - #[cfg(not(target_os = "windows"))] - let mut fast_allocations: Option<*mut ()> = None; - #[cfg(not(target_os = "windows"))] - if !sym.can_callback { - let needs_unwrap = needs_unwrap(sym.result_type); - let ret = match needs_unwrap { - true => fast_api::Type::Void, - false => fast_api::Type::from(&sym.result_type), - }; - - let mut args = sym - .parameter_types - .iter() - .map(|t| t.into()) - .collect::>(); - // recv - args.insert(0, fast_api::Type::V8Value); - if needs_unwrap { - args.push(fast_api::Type::TypedArray(fast_api::CType::Int32)); - } - let symbol_trampoline = - jit_trampoline::gen_trampoline(sym.clone()).expect("gen_trampoline"); - fast_ffi_templ = Some(FfiFastCallTemplate { - args: args.into_boxed_slice(), - ret: (&ret).into(), - symbol_ptr: symbol_trampoline.addr, - }); - fast_allocations = Some(Box::into_raw(symbol_trampoline) as *mut ()); - } - let sym = Box::leak(sym); let builder = v8::FunctionTemplate::builder( |scope: &mut v8::HandleScope, @@ -891,8 +804,17 @@ fn make_sync_fn<'s>( ) .data(v8::External::new(scope, sym as *mut Symbol as *mut _).into()); - let func = if let Some(fast_ffi_templ) = fast_ffi_templ { - builder.build_fast(scope, &fast_ffi_templ, None) + let mut fast_call_alloc = None; + + let func = if fast_call::is_compatible(sym) { + let trampoline = fast_call::compile_trampoline(sym); + let func = builder.build_fast( + scope, + &fast_call::make_template(sym, &trampoline), + None, + ); + fast_call_alloc = Some(Box::into_raw(Box::new(trampoline))); + func } else { builder.build(scope) }; @@ -904,12 +826,12 @@ fn make_sync_fn<'s>( Box::new(move |_| { // SAFETY: This is never called twice. pointer obtained // from Box::into_raw, hence, satisfies memory layout requirements. - unsafe { - Box::from_raw(sym); - #[cfg(not(target_os = "windows"))] - if let Some(fast_allocations) = fast_allocations { - Box::from_raw(fast_allocations as *mut jit_trampoline::Allocation); - } + let _ = unsafe { Box::from_raw(sym) }; + if let Some(fast_call_ptr) = fast_call_alloc { + // fast-call compiled trampoline is unmapped when the MMAP handle is dropped + // SAFETY: This is never called twice. pointer obtained + // from Box::into_raw, hence, satisfies memory layout requirements. + let _ = unsafe { Box::from_raw(fast_call_ptr) }; } }), ); diff --git a/ext/ffi/prelude.h b/ext/ffi/prelude.h deleted file mode 100644 index 2da1e65238..0000000000 --- a/ext/ffi/prelude.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. - -/* Boolean type */ - -#ifndef _STDBOOL_H -#define _STDBOOL_H - -#define bool _Bool -#define true 1 -#define false 0 - -#endif - -/* Exact integral types. */ - -/* Signed. */ -typedef signed char int8_t; -typedef short int int16_t; -typedef int int32_t; -typedef long int int64_t; - -/* Unsigned. */ -typedef unsigned char uint8_t; -typedef unsigned short int uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long int uint64_t; - -/* Types for `void *' pointers. */ -typedef long int intptr_t; -typedef unsigned long int uintptr_t; - -// https://source.chromium.org/chromium/chromium/src/+/main:v8/include/v8-fast-api-calls.h;l=336 -struct FastApiTypedArray { - uintptr_t length_; - void* data; -}; diff --git a/ext/ffi/tcc.rs b/ext/ffi/tcc.rs deleted file mode 100644 index de7c719602..0000000000 --- a/ext/ffi/tcc.rs +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. - -use std::{ - ffi::CStr, - marker::PhantomData, - os::raw::{c_char, c_int, c_void}, - ptr::null_mut, -}; - -#[repr(C)] -#[derive(Debug)] -pub struct TCCState { - _unused: [u8; 0], -} -pub const TCC_OUTPUT_MEMORY: i32 = 1; - -extern "C" { - pub fn tcc_new() -> *mut TCCState; - pub fn tcc_delete(s: *mut TCCState); - pub fn tcc_set_options(s: *mut TCCState, str: *const c_char); - pub fn tcc_compile_string(s: *mut TCCState, buf: *const c_char) -> c_int; - pub fn tcc_add_symbol( - s: *mut TCCState, - name: *const c_char, - val: *const c_void, - ) -> c_int; - pub fn tcc_set_output_type(s: *mut TCCState, output_type: c_int) -> c_int; - pub fn tcc_relocate(s1: *mut TCCState, ptr: *mut c_void) -> c_int; - pub fn tcc_get_symbol(s: *mut TCCState, name: *const c_char) -> *mut c_void; -} - -/// Compilation context. -pub struct Compiler { - inner: *mut TCCState, - _phantom: PhantomData, - pub bin: Option>, -} - -impl Compiler { - pub fn new() -> Result { - // SAFETY: There is one context per thread. - let inner = unsafe { tcc_new() }; - if inner.is_null() { - Err(()) - } else { - let ret = - // SAFETY: set output to memory. - unsafe { tcc_set_output_type(inner, TCC_OUTPUT_MEMORY as c_int) }; - assert_eq!(ret, 0); - Ok(Self { - inner, - _phantom: PhantomData, - bin: None, - }) - } - } - - pub fn set_options(&mut self, option: &CStr) -> &mut Self { - // SAFETY: option is a null-terminated C string. - unsafe { - tcc_set_options(self.inner, option.as_ptr()); - } - self - } - - pub fn compile_string(&mut self, p: &CStr) -> Result<(), ()> { - // SAFETY: p is a null-terminated C string. - let ret = unsafe { tcc_compile_string(self.inner, p.as_ptr()) }; - if ret == 0 { - Ok(()) - } else { - Err(()) - } - } - - /// # Safety - /// Symbol need satisfy ABI requirement. - pub unsafe fn add_symbol(&mut self, sym: &CStr, val: *const c_void) { - // SAFETY: sym is a null-terminated C string. - let ret = tcc_add_symbol(self.inner, sym.as_ptr(), val); - assert_eq!(ret, 0); - } - - pub fn relocate_and_get_symbol( - &mut self, - sym: &CStr, - ) -> Result<*mut c_void, ()> { - // SAFETY: pass null ptr to get required length - let len = unsafe { tcc_relocate(self.inner, null_mut()) }; - if len == -1 { - return Err(()); - }; - let mut bin = Vec::with_capacity(len as usize); - let ret = - // SAFETY: bin is allocated up to len. - unsafe { tcc_relocate(self.inner, bin.as_mut_ptr() as *mut c_void) }; - if ret != 0 { - return Err(()); - } - // SAFETY: if ret == 0, bin is initialized. - unsafe { - bin.set_len(len as usize); - } - self.bin = Some(bin); - // SAFETY: sym is a null-terminated C string. - let addr = unsafe { tcc_get_symbol(self.inner, sym.as_ptr()) }; - Ok(addr) - } -} - -impl Drop for Compiler { - fn drop(&mut self) { - // SAFETY: delete state from tcc_new() - unsafe { tcc_delete(self.inner) }; - } -} diff --git a/ext/ffi/tinycc b/ext/ffi/tinycc deleted file mode 160000 index afc136262e..0000000000 --- a/ext/ffi/tinycc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit afc136262e93ae85fb3643005b36dbfc30d99c42 diff --git a/test_ffi/Cargo.toml b/test_ffi/Cargo.toml index cc7708fbc2..38dd86b3ea 100644 --- a/test_ffi/Cargo.toml +++ b/test_ffi/Cargo.toml @@ -11,4 +11,5 @@ publish = false crate-type = ["cdylib"] [dev-dependencies] +pretty_assertions = "1.2.1" test_util = { path = "../test_util" } diff --git a/test_ffi/src/lib.rs b/test_ffi/src/lib.rs index 812b563ef7..be0d2e42f8 100644 --- a/test_ffi/src/lib.rs +++ b/test_ffi/src/lib.rs @@ -258,6 +258,60 @@ pub extern "C" fn call_stored_function_thread_safe_and_log() { }); } +#[no_mangle] +pub extern "C" fn log_many_parameters( + a: u8, + b: u16, + c: u32, + d: u64, + e: f64, + f: f32, + g: i64, + h: i32, + i: i16, + j: i8, + k: isize, + l: usize, + m: f64, + n: f32, + o: f64, + p: f32, + q: f64, + r: f32, + s: f64, +) { + println!("{a} {b} {c} {d} {e} {f} {g} {h} {i} {j} {k} {l} {m} {n} {o} {p} {q} {r} {s}"); +} + +#[no_mangle] +pub extern "C" fn cast_u8_u32(x: u8) -> u32 { + x as u32 +} + +#[no_mangle] +pub extern "C" fn cast_u32_u8(x: u32) -> u8 { + x as u8 +} + +#[no_mangle] +pub extern "C" fn add_many_u16( + a: u16, + b: u16, + c: u16, + d: u16, + e: u16, + f: u16, + g: u16, + h: u16, + i: u16, + j: u16, + k: u16, + l: u16, + m: u16, +) -> u16 { + a + b + c + d + e + f + g + h + i + j + k + l + m +} + // FFI performance helper functions #[no_mangle] pub extern "C" fn nop() {} diff --git a/test_ffi/tests/integration_tests.rs b/test_ffi/tests/integration_tests.rs index 88da8a0b99..6b48534537 100644 --- a/test_ffi/tests/integration_tests.rs +++ b/test_ffi/tests/integration_tests.rs @@ -80,6 +80,10 @@ fn basic() { 579.912\n\ true\n\ false\n\ + 579.9119873046875\n\ + 579.9119873046875\n\ + 579.912\n\ + 579.912\n\ 579\n\ 8589934590\n\ -8589934590\n\ @@ -105,6 +109,14 @@ fn basic() { buf: [1, 2, 3, 4, 5, 6, 7, 8]\n\ logCallback\n\ 30\n\ + 255 65535 4294967295 4294967296 123.456 789.876 -1 -2 -3 -4 -1000 1000 12345.67891 12345.679 12345.67891 12345.679 12345.67891 12345.679 12345.67891\n\ + 255 65535 4294967295 4294967296 123.456 789.876 -1 -2 -3 -4 -1000 1000 12345.67891 12345.679 12345.67891 12345.679 12345.67891 12345.679 12345.67891\n\ + 0\n\ + 0\n\ + 0\n\ + 0\n\ + 78\n\ + 78\n\ STORED_FUNCTION cleared\n\ STORED_FUNCTION_2 cleared\n\ Thread safe call counter: 0\n\ @@ -120,6 +132,8 @@ fn basic() { uint32Array[0]: 42\n\ uint32Array[0] after mutation: 55\n\ Static ptr value after mutation: 55\n\ + 2264956937\n\ + 2264956937\n\ Correct number of resources\n"; assert_eq!(stdout, expected); assert_eq!(stderr, ""); diff --git a/test_ffi/tests/test.js b/test_ffi/tests/test.js index f71f23adc8..6bf3c47f82 100644 --- a/test_ffi/tests/test.js +++ b/test_ffi/tests/test.js @@ -6,6 +6,7 @@ import { assertEquals } from "https://deno.land/std@0.149.0/testing/asserts.ts"; import { assertThrows, + assert, } from "../../test_util/std/testing/asserts.ts"; const targetDir = Deno.execPath().replace(/[^\/\\]+$/, ""); @@ -175,6 +176,22 @@ const dylib = Deno.dlopen(libPath, { result: "void", callback: true, }, + log_many_parameters: { + parameters: ["u8", "u16", "u32", "u64", "f64", "f32", "i64", "i32", "i16", "i8", "isize", "usize", "f64", "f32", "f64", "f32", "f64", "f32", "f64"], + result: "void", + }, + cast_u8_u32: { + parameters: ["u8"], + result: "u32", + }, + cast_u32_u8: { + parameters: ["u32"], + result: "u8", + }, + add_many_u16: { + parameters: ["u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16"], + result: "u16", + }, // Statics "static_u32": { type: "u32", @@ -191,6 +208,7 @@ const dylib = Deno.dlopen(libPath, { "static_char": { type: "pointer", }, + "hash": { parameters: ["buffer", "u32"], result: "u32" }, }); const { symbols } = dylib; @@ -210,11 +228,7 @@ function returnBuffer() { return return_buffer(); }; returnBuffer(); %OptimizeFunctionOnNextCall(returnBuffer); const ptr0 = returnBuffer(); - -const status = %GetOptimizationStatus(returnBuffer); -if (!(status & (1 << 4))) { - throw new Error("returnBuffer is not optimized"); -} +assertIsOptimized(returnBuffer); dylib.symbols.print_pointer(ptr0, 8); const ptrView = new Deno.UnsafePointerView(ptr0); @@ -266,17 +280,10 @@ const { add_u32, add_usize_fast } = symbols; function addU32Fast(a, b) { return add_u32(a, b); }; - -%PrepareFunctionForOptimization(addU32Fast); -console.log(addU32Fast(123, 456)); -%OptimizeFunctionOnNextCall(addU32Fast); -console.log(addU32Fast(123, 456)); +testOptimized(addU32Fast, () => addU32Fast(123, 456)); function addU64Fast(a, b) { return add_usize_fast(a, b); }; -%PrepareFunctionForOptimization(addU64Fast); -console.log(addU64Fast(2, 3)); -%OptimizeFunctionOnNextCall(addU64Fast); -console.log(addU64Fast(2, 3)); +testOptimized(addU64Fast, () => addU64Fast(2, 3)); console.log(dylib.symbols.add_i32(123, 456)); console.log(dylib.symbols.add_u64(0xffffffffn, 0xffffffffn)); @@ -294,6 +301,16 @@ console.log(dylib.symbols.add_f64(123.123, 456.789)); console.log(dylib.symbols.and(true, true)); console.log(dylib.symbols.and(true, false)); +function addF32Fast(a, b) { + return dylib.symbols.add_f32(a, b); +}; +testOptimized(addF32Fast, () => addF32Fast(123.123, 456.789)); + +function addF64Fast(a, b) { + return dylib.symbols.add_f64(a, b); +}; +testOptimized(addF64Fast, () => addF64Fast(123.123, 456.789)); + // Test adders as nonblocking calls console.log(await dylib.symbols.add_i32_nonblocking(123, 456)); console.log(await dylib.symbols.add_u64_nonblocking(0xffffffffn, 0xffffffffn)); @@ -437,6 +454,39 @@ call_stored_function(); dylib.symbols.store_function_2(add10Callback.pointer); dylib.symbols.call_stored_function_2(20); +function logManyParametersFast(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) { + return symbols.log_many_parameters(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); +}; +testOptimized( + logManyParametersFast, + () => logManyParametersFast( + 255, 65535, 4294967295, 4294967296, 123.456, 789.876, -1, -2, -3, -4, -1000, 1000, + 12345.678910, 12345.678910, 12345.678910, 12345.678910, 12345.678910, 12345.678910, 12345.678910 + ) +); + +// Some ABIs rely on the convention to zero/sign-extend arguments by the caller to optimize the callee function. +// If the trampoline did not zero/sign-extend arguments, this would return 256 instead of the expected 0 (in optimized builds) +function castU8U32Fast(x) { return symbols.cast_u8_u32(x); }; +testOptimized(castU8U32Fast, () => castU8U32Fast(256)); + +// Some ABIs rely on the convention to expect garbage in the bits beyond the size of the return value to optimize the callee function. +// If the trampoline did not zero/sign-extend the return value, this would return 256 instead of the expected 0 (in optimized builds) +function castU32U8Fast(x) { return symbols.cast_u32_u8(x); }; +testOptimized(castU32U8Fast, () => castU32U8Fast(256)); + +// Generally the trampoline tail-calls into the FFI function, but in certain cases (e.g. when returning 8 or 16 bit integers) +// the tail call is not possible and a new stack frame must be created. We need enough parameters to have some on the stack +function addManyU16Fast(a, b, c, d, e, f, g, h, i, j, k, l, m) { + return symbols.add_many_u16(a, b, c, d, e, f, g, h, i, j, k, l, m); +}; +// N.B. V8 does not currently follow Aarch64 Apple's calling convention. +// The current implementation of the JIT trampoline follows the V8 incorrect calling convention. This test covers the use-case +// and is expected to fail once Deno uses a V8 version with the bug fixed. +// The V8 bug is being tracked in https://bugs.chromium.org/p/v8/issues/detail?id=13171 +testOptimized(addManyU16Fast, () => addManyU16Fast(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)); + + const nestedCallback = new Deno.UnsafeCallback( { parameters: [], result: "void" }, () => { @@ -502,6 +552,12 @@ try { console.log("Invalid UTF-8 characters to `v8::String`:", charView.getCString()); } + +const bytes = new Uint8Array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); +function hash() { return dylib.symbols.hash(bytes, bytes.byteLength); }; + +testOptimized(hash, () => hash()); + (function cleanup() { dylib.close(); throwCallback.close(); @@ -526,4 +582,23 @@ After: ${postStr}`, } console.log("Correct number of resources"); -})(); \ No newline at end of file +})(); + +function assertIsOptimized(fn) { + const status = % GetOptimizationStatus(fn); + assert(status & (1 << 4), `expected ${fn.name} to be optimized, but wasn't`); +} + +function testOptimized(fn, callback) { + %PrepareFunctionForOptimization(fn); + const r1 = callback(); + if (r1 !== undefined) { + console.log(r1); + } + %OptimizeFunctionOnNextCall(fn); + const r2 = callback(); + if (r2 !== undefined) { + console.log(r2); + } + assertIsOptimized(fn); +} \ No newline at end of file diff --git a/test_util/Cargo.toml b/test_util/Cargo.toml index 7fa309dac4..ce5ad244d4 100644 --- a/test_util/Cargo.toml +++ b/test_util/Cargo.toml @@ -40,4 +40,4 @@ tokio-tungstenite = "0.16" pty = "0.2.2" [target.'cfg(windows)'.dependencies] -winapi = { version = "0.3.9", features = ["consoleapi", "handleapi", "namedpipeapi", "winbase", "winerror"] } +winapi = { version = "0.3.9", features = ["consoleapi", "synchapi", "handleapi", "namedpipeapi", "winbase", "winerror"] }