From 8bdc3c2bafa9bdfcd6bfaf94b97b960843908ae9 Mon Sep 17 00:00:00 2001 From: Arnau Orriols Date: Wed, 7 Sep 2022 08:53:56 +0200 Subject: [PATCH] feat(ext/ffi): Implement FFI fast-call trampoline with Dynasmrt (#15305) --- .gitmodules | 4 +- Cargo.lock | 37 + ext/ffi/Cargo.toml | 1 + ext/ffi/build.rs | 70 - ext/ffi/fast_call.rs | 2065 +++++++++++++++++++++++++++ ext/ffi/jit_trampoline.rs | 263 ---- ext/ffi/lib.rs | 114 +- ext/ffi/prelude.h | 36 - ext/ffi/tcc.rs | 116 -- ext/ffi/tinycc | 1 - test_ffi/Cargo.toml | 1 + test_ffi/src/lib.rs | 54 + test_ffi/tests/integration_tests.rs | 14 + test_ffi/tests/test.js | 105 +- test_util/Cargo.toml | 2 +- 15 files changed, 2282 insertions(+), 601 deletions(-) delete mode 100644 ext/ffi/build.rs create mode 100644 ext/ffi/fast_call.rs delete mode 100644 ext/ffi/jit_trampoline.rs delete mode 100644 ext/ffi/prelude.h delete mode 100644 ext/ffi/tcc.rs delete mode 160000 ext/ffi/tinycc diff --git a/.gitmodules b/.gitmodules index a94ebe6689..9e4f12afa6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,6 +9,4 @@ [submodule "test_util/wpt"] path = test_util/wpt url = https://github.com/web-platform-tests/wpt.git -[submodule "ext/ffi/tinycc"] - path = ext/ffi/tinycc - url = https://github.com/TinyCC/tinycc + diff --git a/Cargo.lock b/Cargo.lock index 7a35eafd2b..fba5040678 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1035,6 +1035,7 @@ version = "0.54.0" dependencies = [ "deno_core", "dlopen", + "dynasmrt", "libffi", "serde", "tokio", @@ -1467,6 +1468,32 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21e50f3adc76d6a43f5ed73b698a87d0760ca74617f60f7c3b879003536fdd28" +[[package]] +name = "dynasm" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add9a102807b524ec050363f09e06f1504214b0e1c7797f64261c891022dce8b" +dependencies = [ + "bitflags", + "byteorder", + "lazy_static", + "proc-macro-error", + "proc-macro2 1.0.39", + "quote 1.0.18", + "syn 1.0.96", +] + +[[package]] +name = "dynasmrt" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64fba5a42bd76a17cad4bfa00de168ee1cbfa06a5e8ce992ae880218c05641a9" +dependencies = [ + "byteorder", + "dynasm", + "memmap2", +] + [[package]] name = "ecdsa" version = "0.14.1" @@ -2657,6 +2684,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memmap2" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a79b39c93a7a5a27eeaf9a23b5ff43f1b9e0ad6b1cdd441140ae53c35613fc7" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.6.5" @@ -4590,6 +4626,7 @@ dependencies = [ name = "test_ffi" version = "0.1.0" dependencies = [ + "pretty_assertions", "test_util", ] diff --git a/ext/ffi/Cargo.toml b/ext/ffi/Cargo.toml index af866317f1..be094c3dcd 100644 --- a/ext/ffi/Cargo.toml +++ b/ext/ffi/Cargo.toml @@ -16,6 +16,7 @@ path = "lib.rs" [dependencies] deno_core = { version = "0.149.0", path = "../../core" } dlopen = "0.1.8" +dynasmrt = "1.2.3" libffi = "3.0.0" serde = { version = "1.0.129", features = ["derive"] } tokio = { version = "1.17", features = ["full"] } diff --git a/ext/ffi/build.rs b/ext/ffi/build.rs deleted file mode 100644 index 1debd6b9c0..0000000000 --- a/ext/ffi/build.rs +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. 
- -#[cfg(not(target_os = "windows"))] -fn build_tcc() { - use std::env; - - { - // TODO(@littledivy): Windows support for fast call. - // let tcc_path = root - // .parent() - // .unwrap() - // .to_path_buf() - // .parent() - // .unwrap() - // .to_path_buf() - // .join("third_party") - // .join("prebuilt") - // .join("win"); - // println!("cargo:rustc-link-search=native={}", tcc_path.display()); - } - #[cfg(not(target_os = "windows"))] - { - use std::path::PathBuf; - use std::process::exit; - use std::process::Command; - - let root = PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"))); - let tcc_src = root.join("tinycc"); - dbg!(&tcc_src); - let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); - let mut configure = Command::new(tcc_src.join("configure")); - configure.current_dir(&out_dir); - configure.args(&["--enable-static", "--extra-cflags=-fPIC -O3 -g -static"]); - let status = configure.status().unwrap(); - if !status.success() { - eprintln!("Fail to configure: {:?}", status); - exit(1); - } - - let mut make = Command::new("make"); - make.current_dir(&out_dir).arg(format!( - "-j{}", - env::var("NUM_JOBS").unwrap_or_else(|_| String::from("1")) - )); - make.args(&["libtcc.a"]); - let status = make.status().unwrap(); - - if !status.success() { - eprintln!("Fail to make: {:?}", status); - exit(1); - } - println!("cargo:rustc-link-search=native={}", out_dir.display()); - println!("cargo:rerun-if-changed={}", tcc_src.display()); - } -} - -#[cfg(target_os = "windows")] -fn main() {} - -#[cfg(not(target_os = "windows"))] -fn main() { - use std::env; - - if let Ok(tcc_path) = env::var("TCC_PATH") { - println!("cargo:rustc-link-search=native={}", tcc_path); - } else { - build_tcc(); - } - println!("cargo:rustc-link-lib=static=tcc"); -} diff --git a/ext/ffi/fast_call.rs b/ext/ffi/fast_call.rs new file mode 100644 index 0000000000..dc098a69aa --- /dev/null +++ b/ext/ffi/fast_call.rs @@ -0,0 +1,2065 @@ +// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. 
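+//
+// Overview: V8's fast API invokes a symbol with the receiver object as an implicit
+// first argument, roughly `fn(recv, a, b, ...)`. The trampolines emitted in this module
+// bridge that signature to the plain FFI signature `fn(a, b, ...)`: each argument is
+// shifted one position to the left according to the target calling convention, and the
+// native symbol is then reached with a tail jump, or with a regular call when the
+// return value needs casting or wrapping.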
+ +use std::cmp::max; +use std::ffi::c_void; +use std::iter::once; + +use deno_core::v8::fast_api; +use dynasmrt::dynasm; +use dynasmrt::DynasmApi; +use dynasmrt::ExecutableBuffer; + +use crate::needs_unwrap; +use crate::NativeType; +use crate::Symbol; + +pub(crate) fn is_compatible(sym: &Symbol) -> bool { + cfg!(any( + all(target_arch = "x86_64", target_family = "unix"), + all(target_arch = "x86_64", target_family = "windows"), + all(target_arch = "aarch64", target_vendor = "apple") + )) && !sym.can_callback +} + +pub(crate) fn compile_trampoline(sym: &Symbol) -> Trampoline { + #[cfg(all(target_arch = "x86_64", target_family = "unix"))] + return SysVAmd64::compile(sym); + #[cfg(all(target_arch = "x86_64", target_family = "windows"))] + return Win64::compile(sym); + #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] + return Aarch64Apple::compile(sym); + #[allow(unreachable_code)] + { + unimplemented!("fast API is not implemented for the current target"); + } +} + +pub(crate) fn make_template(sym: &Symbol, trampoline: &Trampoline) -> Template { + let mut params = once(fast_api::Type::V8Value) // Receiver + .chain(sym.parameter_types.iter().map(|t| t.into())) + .collect::>(); + + let ret = if needs_unwrap(sym.result_type) { + params.push(fast_api::Type::TypedArray(fast_api::CType::Int32)); + fast_api::Type::Void + } else { + fast_api::Type::from(&sym.result_type) + }; + + Template { + args: params.into_boxed_slice(), + ret: (&ret).into(), + symbol_ptr: trampoline.ptr(), + } +} + +/// Trampoline for fast-call FFI functions +/// +/// Calls the FFI function without the first argument (the receiver) +pub(crate) struct Trampoline(ExecutableBuffer); + +impl Trampoline { + fn ptr(&self) -> *const c_void { + &self.0[0] as *const u8 as *const c_void + } +} + +pub(crate) struct Template { + args: Box<[fast_api::Type]>, + ret: fast_api::CType, + symbol_ptr: *const c_void, +} + +impl fast_api::FastFunction for Template { + fn function(&self) -> *const c_void { + self.symbol_ptr + } + + fn args(&self) -> &'static [fast_api::Type] { + Box::leak(self.args.clone()) + } + + fn return_type(&self) -> fast_api::CType { + self.ret + } +} + +impl From<&NativeType> for fast_api::Type { + fn from(native_type: &NativeType) -> Self { + match native_type { + NativeType::Bool => fast_api::Type::Bool, + NativeType::U8 | NativeType::U16 | NativeType::U32 => { + fast_api::Type::Uint32 + } + NativeType::I8 | NativeType::I16 | NativeType::I32 => { + fast_api::Type::Int32 + } + NativeType::F32 => fast_api::Type::Float32, + NativeType::F64 => fast_api::Type::Float64, + NativeType::Void => fast_api::Type::Void, + NativeType::I64 => fast_api::Type::Int64, + NativeType::U64 => fast_api::Type::Uint64, + NativeType::ISize => fast_api::Type::Int64, + NativeType::USize | NativeType::Pointer | NativeType::Function => { + fast_api::Type::Uint64 + } + NativeType::Buffer => fast_api::Type::TypedArray(fast_api::CType::Uint8), + } + } +} + +macro_rules! x64 { + ($assembler:expr; $($tokens:tt)+) => { + dynasm!($assembler; .arch x64; $($tokens)+) + } +} + +macro_rules! 
aarch64 { + ($assembler:expr; $($tokens:tt)+) => { + dynasm!($assembler; .arch aarch64; $($tokens)+) + } +} + +struct SysVAmd64 { + // Reference: https://refspecs.linuxfoundation.org/elf/x86_64-abi-0.99.pdf + assmblr: dynasmrt::x64::Assembler, + // Parameter counters + integral_params: u32, + float_params: u32, + // Stack offset accumulators + offset_trampoline: u32, + offset_callee: u32, + allocated_stack: u32, + frame_pointer: u32, +} + +#[cfg_attr( + not(all(target_aarch = "x86_64", target_family = "unix")), + allow(dead_code) +)] +impl SysVAmd64 { + // Integral arguments go to the following GPR, in order: rdi, rsi, rdx, rcx, r8, r9 + const INTEGRAL_REGISTERS: u32 = 6; + // SSE arguments go to the first 8 SSE registers: xmm0-xmm7 + const FLOAT_REGISTERS: u32 = 8; + + fn new() -> Self { + Self { + assmblr: dynasmrt::x64::Assembler::new().unwrap(), + integral_params: 0, + float_params: 0, + // Start at 8 to account for trampoline caller's return address + offset_trampoline: 8, + // default to tail-call mode. If a new stack frame is allocated this becomes 0 + offset_callee: 8, + allocated_stack: 0, + frame_pointer: 0, + } + } + + fn compile(sym: &Symbol) -> Trampoline { + let mut compiler = Self::new(); + + let must_cast_return_value = + compiler.must_cast_return_value(sym.result_type); + let must_wrap_return_value = + compiler.must_wrap_return_value_in_typed_array(sym.result_type); + let must_save_preserved_register = must_wrap_return_value; + let cannot_tailcall = must_cast_return_value || must_wrap_return_value; + + if cannot_tailcall { + if must_save_preserved_register { + compiler.save_preserved_register_to_stack(); + } + compiler.allocate_stack(&sym.parameter_types); + } + + for param in sym.parameter_types.iter().copied() { + compiler.move_left(param) + } + if !compiler.is_recv_arg_overridden() { + // the receiver object should never be expected. Avoid its unexpected or deliberate leak + compiler.zero_first_arg(); + } + if must_wrap_return_value { + compiler.save_out_array_to_preserved_register(); + } + + if cannot_tailcall { + compiler.call(sym.ptr.as_ptr()); + if must_cast_return_value { + compiler.cast_return_value(sym.result_type); + } + if must_wrap_return_value { + compiler.wrap_return_value_in_out_array(); + } + compiler.deallocate_stack(); + if must_save_preserved_register { + compiler.recover_preserved_register(); + } + compiler.ret(); + } else { + compiler.tailcall(sym.ptr.as_ptr()); + } + + Trampoline(compiler.finalize()) + } + + fn move_left(&mut self, param: NativeType) { + // Section 3.2.3 of the SysV ABI spec, on argument classification: + // - INTEGER: + // > Arguments of types (signed and unsigned) _Bool, char, short, int, + // > long, long long, and pointers are in the INTEGER class. + // - SSE: + // > Arguments of types float, double, _Decimal32, _Decimal64 and + // > __m64 are in class SSE. + match param.into() { + Int(integral) => self.move_integral(integral), + Float(float) => self.move_float(float), + } + } + + fn move_float(&mut self, param: Floating) { + // Section 3.2.3 of the SysV AMD64 ABI: + // > If the class is SSE, the next available vector register is used, the registers + // > are taken in the order from %xmm0 to %xmm7. + // [...] 
+ // > Once registers are assigned, the arguments passed in memory are pushed on + // > the stack in reversed (right-to-left) order + let param_i = self.float_params; + + let is_in_stack = param_i >= Self::FLOAT_REGISTERS; + // floats are only moved to accommodate integer movement in the stack + let stack_has_moved = self.allocated_stack > 0 + || self.integral_params >= Self::INTEGRAL_REGISTERS; + + if is_in_stack && stack_has_moved { + let s = &mut self.assmblr; + let ot = self.offset_trampoline as i32; + let oc = self.offset_callee as i32; + match param { + Single => x64!(s + ; movss xmm8, [rsp + ot] + ; movss [rsp + oc], xmm8 + ), + Double => x64!(s + ; movsd xmm8, [rsp + ot] + ; movsd [rsp + oc], xmm8 + ), + } + + // Section 3.2.3 of the SysV AMD64 ABI: + // > The size of each argument gets rounded up to eightbytes. [...] Therefore the stack will always be eightbyte aligned. + self.offset_trampoline += 8; + self.offset_callee += 8; + + debug_assert!( + self.allocated_stack == 0 || self.offset_callee <= self.allocated_stack + ); + } + self.float_params += 1; + } + + fn move_integral(&mut self, arg: Integral) { + // Section 3.2.3 of the SysV AMD64 ABI: + // > If the class is INTEGER, the next available register of the sequence %rdi, + // > %rsi, %rdx, %rcx, %r8 and %r9 is used + // [...] + // > Once registers are assigned, the arguments passed in memory are pushed on + // > the stack in reversed (right-to-left) order + let s = &mut self.assmblr; + let param_i = self.integral_params; + + // move each argument one position to the left. The first argument in the stack moves to the last integer register (r9). + // If the FFI function is called with a new stack frame, the arguments remaining in the stack are copied to the new stack frame. + // Otherwise, they are copied 8 bytes lower in the same frame + match (param_i, arg) { + // u8 and u16 parameters are defined as u32 parameters in the V8's fast API function. The trampoline takes care of the cast. 
+ // Conventionally, many compilers expect 8 and 16 bit arguments to be sign/zero extended by the caller + // See https://stackoverflow.com/a/36760539/2623340 + (0, U(B)) => x64!(s; movzx edi, sil), + (0, I(B)) => x64!(s; movsx edi, sil), + (0, U(W)) => x64!(s; movzx edi, si), + (0, I(W)) => x64!(s; movsx edi, si), + (0, U(DW) | I(DW)) => x64!(s; mov edi, esi), + (0, U(QW) | I(QW)) => x64!(s; mov rdi, rsi), + // The fast API expects buffer arguments passed as a pointer to a FastApiTypedArray struct + // Here we blindly follow the layout of https://github.com/denoland/rusty_v8/blob/main/src/fast_api.rs#L190-L200 + // although that might be problematic: https://discord.com/channels/684898665143206084/956626010248478720/1009450940866252823 + (0, Buffer) => x64!(s; mov rdi, [rsi + 8]), + + (1, U(B)) => x64!(s; movzx esi, dl), + (1, I(B)) => x64!(s; movsx esi, dl), + (1, U(W)) => x64!(s; movzx esi, dx), + (1, I(W)) => x64!(s; movsx esi, dx), + (1, U(DW) | I(DW)) => x64!(s; mov esi, edx), + (1, U(QW) | I(QW)) => x64!(s; mov rsi, rdx), + (1, Buffer) => x64!(s; mov rsi, [rdx + 8]), + + (2, U(B)) => x64!(s; movzx edx, cl), + (2, I(B)) => x64!(s; movsx edx, cl), + (2, U(W)) => x64!(s; movzx edx, cx), + (2, I(W)) => x64!(s; movsx edx, cx), + (2, U(DW) | I(DW)) => x64!(s; mov edx, ecx), + (2, U(QW) | I(QW)) => x64!(s; mov rdx, rcx), + (2, Buffer) => x64!(s; mov rdx, [rcx + 8]), + + (3, U(B)) => x64!(s; movzx ecx, r8b), + (3, I(B)) => x64!(s; movsx ecx, r8b), + (3, U(W)) => x64!(s; movzx ecx, r8w), + (3, I(W)) => x64!(s; movsx ecx, r8w), + (3, U(DW) | I(DW)) => x64!(s; mov ecx, r8d), + (3, U(QW) | I(QW)) => x64!(s; mov rcx, r8), + (3, Buffer) => x64!(s; mov rcx, [r8 + 8]), + + (4, U(B)) => x64!(s; movzx r8d, r9b), + (4, I(B)) => x64!(s; movsx r8d, r9b), + (4, U(W)) => x64!(s; movzx r8d, r9w), + (4, I(W)) => x64!(s; movsx r8d, r9w), + (4, U(DW) | I(DW)) => x64!(s; mov r8d, r9d), + (4, U(QW) | I(QW)) => x64!(s; mov r8, r9), + (4, Buffer) => x64!(s; mov r8, [r9 + 8]), + + (5, param) => { + let ot = self.offset_trampoline as i32; + // First argument in stack goes to last register (r9) + match param { + U(B) => x64!(s; movzx r9d, BYTE [rsp + ot]), + I(B) => x64!(s; movsx r9d, BYTE [rsp + ot]), + U(W) => x64!(s; movzx r9d, WORD [rsp + ot]), + I(W) => x64!(s; movsx r9d, WORD [rsp + ot]), + U(DW) | I(DW) => x64!(s; mov r9d, [rsp + ot]), + U(QW) | I(QW) => x64!(s; mov r9, [rsp + ot]), + Buffer => x64!(s + ; mov r9, [rsp + ot] + ; mov r9, [r9 + 8] + ), + } + // Section 3.2.3 of the SysV AMD64 ABI: + // > The size of each argument gets rounded up to eightbytes. [...] Therefore the stack will always be eightbyte aligned. + self.offset_trampoline += 8; + } + + (6.., param) => { + let ot = self.offset_trampoline as i32; + let oc = self.offset_callee as i32; + match param { + U(B) => x64!(s + // TODO: optimize to [rsp] (without immediate) when offset is 0 + ; movzx eax, BYTE [rsp + ot] + ; mov [rsp + oc], eax + ), + I(B) => x64!(s + ; movsx eax, BYTE [rsp + ot] + ; mov [rsp + oc], eax + ), + U(W) => x64!(s + ; movzx eax, WORD [rsp + ot] + ; mov [rsp + oc], eax + ), + I(W) => x64!(s + ; movsx eax, WORD [rsp + ot] + ; mov [rsp + oc], eax + ), + U(DW) | I(DW) => x64!(s + ; mov eax, [rsp + ot] + ; mov [rsp + oc], eax + ), + U(QW) | I(QW) => x64!(s + ; mov rax, [rsp + ot] + ; mov [rsp + oc], rax + ), + Buffer => x64!(s + ; mov rax, [rsp + ot] + ; mov rax, [rax + 8] + ; mov [rsp + oc], rax + ), + } + // Section 3.2.3 of the SysV AMD64 ABI: + // > The size of each argument gets rounded up to eightbytes. [...] 
Therefore the stack will always be eightbyte aligned. + self.offset_trampoline += 8; + self.offset_callee += 8; + + debug_assert!( + self.allocated_stack == 0 + || self.offset_callee <= self.allocated_stack + ); + } + } + self.integral_params += 1; + } + + fn zero_first_arg(&mut self) { + debug_assert!( + self.integral_params == 0, + "the trampoline would zero the first argument after having overridden it with the second one" + ); + dynasm!(self.assmblr + ; .arch x64 + ; xor edi, edi + ); + } + + fn cast_return_value(&mut self, rv: NativeType) { + let s = &mut self.assmblr; + // V8 only supports 32bit integers. We support 8 and 16 bit integers casting them to 32bits. + // In SysV-AMD64 the convention dictates that the unused bits of the return value contain garbage, so we + // need to zero/sign extend the return value explicitly + match rv { + NativeType::U8 => x64!(s; movzx eax, al), + NativeType::I8 => x64!(s; movsx eax, al), + NativeType::U16 => x64!(s; movzx eax, ax), + NativeType::I16 => x64!(s; movsx eax, ax), + _ => (), + } + } + + fn save_out_array_to_preserved_register(&mut self) { + let s = &mut self.assmblr; + // functions returning 64 bit integers have the out array appended as their last parameter, + // and it is a *FastApiTypedArray + match self.integral_params { + // Trampoline's signature is (receiver, [param0, param1, ...], *FastApiTypedArray) + // self.integral_params account only for the original params [param0, param1, ...] + // and the out array has not been moved left + 0 => x64!(s; mov rbx, [rsi + 8]), + 1 => x64!(s; mov rbx, [rdx + 8]), + 2 => x64!(s; mov rbx, [rcx + 8]), + 3 => x64!(s; mov rbx, [r8 + 8]), + 4 => x64!(s; mov rbx, [r9 + 8]), + 5.. => { + x64!(s + ; mov rax, [rsp + self.offset_trampoline as i32] + ; mov rbx, [rax + 8] + ) + } + } + } + + fn wrap_return_value_in_out_array(&mut self) { + x64!(self.assmblr; mov [rbx], rax); + } + + fn save_preserved_register_to_stack(&mut self) { + x64!(self.assmblr; push rbx); + self.offset_trampoline += 8; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.frame_pointer += 8; + } + + fn recover_preserved_register(&mut self) { + debug_assert!( + self.frame_pointer >= 8, + "the trampoline would try to pop from the stack beyond its frame pointer" + ); + x64!(self.assmblr; pop rbx); + self.frame_pointer -= 8; + // parameter offsets are invalid once this method is called + } + + fn allocate_stack(&mut self, params: &[NativeType]) { + let mut int_params = 0u32; + let mut float_params = 0u32; + for param in params { + match param { + NativeType::F32 | NativeType::F64 => float_params += 1, + _ => int_params += 1, + } + } + let mut stack_size = (int_params.saturating_sub(Self::INTEGRAL_REGISTERS) + + float_params.saturating_sub(Self::FLOAT_REGISTERS)) + * 8; + + // Align new stack frame (accounting for the 8 byte of the trampoline caller's return address + // and any other potential addition to the stack prior to this allocation) + // Section 3.2.2 of the SysV AMD64 ABI: + // > The end of the input argument area shall be aligned on a 16 (32 or 64, if + // > __m256 or __m512 is passed on stack) byte boundary. In other words, the value + // > (%rsp + 8) is always a multiple of 16 (32 or 64) when control is transferred to + // > the function entry point. The stack pointer, %rsp, always points to the end of the + // > latest allocated stack frame. 
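+ // For example (illustrative numbers): with eight integral parameters and a preserved
+ // register already pushed (frame_pointer == 8), two arguments spill to the stack, so
+ // stack_size starts at 16; padding_to_align(16, 8 + 16 + 8) is then 0, and after
+ // `sub rsp, 16` the call site observes (8 + frame_pointer) % 16 == 0, as asserted in `call`.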
+ stack_size += padding_to_align(16, self.frame_pointer + stack_size + 8); + + if stack_size > 0 { + x64!(self.assmblr; sub rsp, stack_size as i32); + self.offset_trampoline += stack_size; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.allocated_stack += stack_size; + self.frame_pointer += stack_size; + } + } + + fn deallocate_stack(&mut self) { + debug_assert!( + self.frame_pointer >= self.allocated_stack, + "the trampoline would try to deallocate stack beyond its frame pointer" + ); + if self.allocated_stack > 0 { + x64!(self.assmblr; add rsp, self.allocated_stack as i32); + + self.frame_pointer -= self.allocated_stack; + self.allocated_stack = 0; + } + } + + fn call(&mut self, ptr: *const c_void) { + // the stack has been aligned during stack allocation and/or pushing of preserved registers + debug_assert!( + (8 + self.frame_pointer) % 16 == 0, + "the trampoline would call the FFI function with an unaligned stack" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; call rax + ); + } + + fn tailcall(&mut self, ptr: *const c_void) { + // stack pointer is never modified and remains aligned + // return address remains the one provided by the trampoline's caller (V8) + debug_assert!( + self.allocated_stack == 0, + "the trampoline would tail call the FFI function with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would tail call the FFI function with outstanding locals in the frame" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; jmp rax + ); + } + + fn ret(&mut self) { + debug_assert!( + self.allocated_stack == 0, + "the trampoline would return with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would return with outstanding locals in the frame" + ); + x64!(self.assmblr; ret); + } + + fn is_recv_arg_overridden(&self) -> bool { + // V8 receiver is the first parameter of the trampoline function and is a pointer + self.integral_params > 0 + } + + fn must_cast_return_value(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 8 and 16 bit integers by extending them to 32 bits in the trampoline before returning + matches!( + rv, + NativeType::U8 | NativeType::I8 | NativeType::U16 | NativeType::I16 + ) + } + + fn must_wrap_return_value_in_typed_array(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 64 bit integers by wrapping them in a TypedArray out parameter + crate::needs_unwrap(rv) + } + + fn finalize(self) -> ExecutableBuffer { + self.assmblr.finalize().unwrap() + } +} + +struct Aarch64Apple { + // Reference https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst + assmblr: dynasmrt::aarch64::Assembler, + // Parameter counters + integral_params: u32, + float_params: u32, + // Stack offset accumulators + offset_trampoline: u32, + offset_callee: u32, + allocated_stack: u32, +} + +#[cfg_attr( + not(all(target_aarch = "aarch64", target_vendor = "apple")), + allow(dead_code) +)] +impl Aarch64Apple { + // Integral arguments go to the first 8 GPR: x0-x7 + const INTEGRAL_REGISTERS: u32 = 8; + // Floating-point arguments go to the first 8 SIMD & Floating-Point registers: v0-v1 + const FLOAT_REGISTERS: u32 = 8; + + fn new() -> Self { + Self { + assmblr: dynasmrt::aarch64::Assembler::new().unwrap(), + integral_params: 0, + float_params: 0, + offset_trampoline: 0, + 
offset_callee: 0, + allocated_stack: 0, + } + } + + fn compile(sym: &Symbol) -> Trampoline { + let mut compiler = Self::new(); + + let must_wrap_return_value = + compiler.must_wrap_return_value_in_typed_array(sym.result_type); + let must_save_preserved_register = must_wrap_return_value; + let cannot_tailcall = must_wrap_return_value; + + if cannot_tailcall { + compiler.allocate_stack(sym); + compiler.save_frame_record(); + if compiler.must_save_preserved_register_to_stack(sym) { + compiler.save_preserved_register_to_stack(); + } + } + + for param in sym.parameter_types.iter().copied() { + compiler.move_left(param) + } + if !compiler.is_recv_arg_overridden() { + // the receiver object should never be expected. Avoid its unexpected or deliberate leak + compiler.zero_first_arg(); + } + if compiler.must_wrap_return_value_in_typed_array(sym.result_type) { + compiler.save_out_array_to_preserved_register(); + } + + if cannot_tailcall { + compiler.call(sym.ptr.as_ptr()); + if must_wrap_return_value { + compiler.wrap_return_value_in_out_array(); + } + if must_save_preserved_register { + compiler.recover_preserved_register(); + } + compiler.recover_frame_record(); + compiler.deallocate_stack(); + compiler.ret(); + } else { + compiler.tailcall(sym.ptr.as_ptr()); + } + + Trampoline(compiler.finalize()) + } + + fn move_left(&mut self, param: NativeType) { + // Section 6.4.2 of the Aarch64 Procedure Call Standard (PCS), on argument classification: + // - INTEGRAL or POINTER: + // > If the argument is an Integral or Pointer Type, the size of the argument is less than or equal to 8 bytes + // > and the NGRN is less than 8, the argument is copied to the least significant bits in x[NGRN]. + // + // - Floating-Point or Vector: + // > If the argument is a Half-, Single-, Double- or Quad- precision Floating-point or short vector type + // > and the NSRN is less than 8, then the argument is allocated to the least significant bits of register v[NSRN] + match param.into() { + Int(integral) => self.move_integral(integral), + Float(float) => self.move_float(float), + } + } + + fn move_float(&mut self, param: Floating) { + // Section 6.4.2 of the Aarch64 PCS: + // > If the argument is a Half-, Single-, Double- or Quad- precision Floating-point or short vector type and the NSRN is less than 8, then the + // > argument is allocated to the least significant bits of register v[NSRN]. The NSRN is incremented by one. The argument has now been allocated. + // > [if NSRN is equal or more than 8] + // > The argument is copied to memory at the adjusted NSAA. The NSAA is incremented by the size of the argument. The argument has now been allocated. + let param_i = self.float_params; + + let is_in_stack = param_i >= Self::FLOAT_REGISTERS; + if is_in_stack { + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > Function arguments may consume slots on the stack that are not multiples of 8 bytes. + // (i.e. 
natural alignment instead of eightbyte alignment) + let padding_trampl = + (param.size() - self.offset_trampoline % param.size()) % param.size(); + let padding_callee = + (param.size() - self.offset_callee % param.size()) % param.size(); + + // floats are only moved to accommodate integer movement in the stack + let stack_has_moved = self.integral_params >= Self::INTEGRAL_REGISTERS; + if stack_has_moved { + let s = &mut self.assmblr; + let ot = self.offset_trampoline; + let oc = self.offset_callee; + match param { + Single => aarch64!(s + // 6.1.2 Aarch64 PCS: + // > Registers v8-v15 must be preserved by a callee across subroutine calls; + // > the remaining registers (v0-v7, v16-v31) do not need to be preserved (or should be preserved by the caller). + ; ldr s16, [sp, ot + padding_trampl] + ; str s16, [sp, oc + padding_callee] + ), + Double => aarch64!(s + ; ldr d16, [sp, ot + padding_trampl] + ; str d16, [sp, oc + padding_callee] + ), + } + } + self.offset_trampoline += padding_trampl + param.size(); + self.offset_callee += padding_callee + param.size(); + + debug_assert!( + self.allocated_stack == 0 || self.offset_callee <= self.allocated_stack + ); + } + self.float_params += 1; + } + + fn move_integral(&mut self, param: Integral) { + let s = &mut self.assmblr; + // Section 6.4.2 of the Aarch64 PCS: + // If the argument is an Integral or Pointer Type, the size of the argument is less than or + // equal to 8 bytes and the NGRN is less than 8, the argument is copied to the least + // significant bits in x[NGRN]. The NGRN is incremented by one. The argument has now been + // allocated. + // [if NGRN is equal or more than 8] + // The argument is copied to memory at the adjusted NSAA. The NSAA is incremented by the size + // of the argument. The argument has now been allocated. + let param_i = self.integral_params; + + // move each argument one position to the left. The first argument in the stack moves to the last integer register (x7). + match (param_i, param) { + // From https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > The caller of a function is responsible for signing or zero-extending any argument with fewer than 32 bits. + // > The standard ABI expects the callee to sign or zero-extend those arguments. 
+ // (this applies to register parameters, as stack parameters are not eightbyte aligned in Apple) + (0, I(B)) => aarch64!(s; sxtb w0, w1), + (0, U(B)) => aarch64!(s; and w0, w1, 0xFF), + (0, I(W)) => aarch64!(s; sxth w0, w1), + (0, U(W)) => aarch64!(s; and w0, w1, 0xFFFF), + (0, I(DW) | U(DW)) => aarch64!(s; mov w0, w1), + (0, I(QW) | U(QW)) => aarch64!(s; mov x0, x1), + // The fast API expects buffer arguments passed as a pointer to a FastApiTypedArray struct + // Here we blindly follow the layout of https://github.com/denoland/rusty_v8/blob/main/src/fast_api.rs#L190-L200 + // although that might be problematic: https://discord.com/channels/684898665143206084/956626010248478720/1009450940866252823 + (0, Buffer) => aarch64!(s; ldr x0, [x1, 8]), + + (1, I(B)) => aarch64!(s; sxtb w1, w2), + (1, U(B)) => aarch64!(s; and w1, w2, 0xFF), + (1, I(W)) => aarch64!(s; sxth w1, w2), + (1, U(W)) => aarch64!(s; and w1, w2, 0xFFFF), + (1, I(DW) | U(DW)) => aarch64!(s; mov w1, w2), + (1, I(QW) | U(QW)) => aarch64!(s; mov x1, x2), + (1, Buffer) => aarch64!(s; ldr x1, [x2, 8]), + + (2, I(B)) => aarch64!(s; sxtb w2, w3), + (2, U(B)) => aarch64!(s; and w2, w3, 0xFF), + (2, I(W)) => aarch64!(s; sxth w2, w3), + (2, U(W)) => aarch64!(s; and w2, w3, 0xFFFF), + (2, I(DW) | U(DW)) => aarch64!(s; mov w2, w3), + (2, I(QW) | U(QW)) => aarch64!(s; mov x2, x3), + (2, Buffer) => aarch64!(s; ldr x2, [x3, 8]), + + (3, I(B)) => aarch64!(s; sxtb w3, w4), + (3, U(B)) => aarch64!(s; and w3, w4, 0xFF), + (3, I(W)) => aarch64!(s; sxth w3, w4), + (3, U(W)) => aarch64!(s; and w3, w4, 0xFFFF), + (3, I(DW) | U(DW)) => aarch64!(s; mov w3, w4), + (3, I(QW) | U(QW)) => aarch64!(s; mov x3, x4), + (3, Buffer) => aarch64!(s; ldr x3, [x4, 8]), + + (4, I(B)) => aarch64!(s; sxtb w4, w5), + (4, U(B)) => aarch64!(s; and w4, w5, 0xFF), + (4, I(W)) => aarch64!(s; sxth w4, w5), + (4, U(W)) => aarch64!(s; and w4, w5, 0xFFFF), + (4, I(DW) | U(DW)) => aarch64!(s; mov w4, w5), + (4, I(QW) | U(QW)) => aarch64!(s; mov x4, x5), + (4, Buffer) => aarch64!(s; ldr x4, [x5, 8]), + + (5, I(B)) => aarch64!(s; sxtb w5, w6), + (5, U(B)) => aarch64!(s; and w5, w6, 0xFF), + (5, I(W)) => aarch64!(s; sxth w5, w6), + (5, U(W)) => aarch64!(s; and w5, w6, 0xFFFF), + (5, I(DW) | U(DW)) => aarch64!(s; mov w5, w6), + (5, I(QW) | U(QW)) => aarch64!(s; mov x5, x6), + (5, Buffer) => aarch64!(s; ldr x5, [x6, 8]), + + (6, I(B)) => aarch64!(s; sxtb w6, w7), + (6, U(B)) => aarch64!(s; and w6, w7, 0xFF), + (6, I(W)) => aarch64!(s; sxth w6, w7), + (6, U(W)) => aarch64!(s; and w6, w7, 0xFFFF), + (6, I(DW) | U(DW)) => aarch64!(s; mov w6, w7), + (6, I(QW) | U(QW)) => aarch64!(s; mov x6, x7), + (6, Buffer) => aarch64!(s; ldr x6, [x7, 8]), + + (7, param) => { + let ot = self.offset_trampoline; + match param { + I(B) => { + aarch64!(s; ldrsb w7, [sp, ot]) + } + U(B) => { + // ldrb zero-extends the byte to fill the 32bits of the register + aarch64!(s; ldrb w7, [sp, ot]) + } + I(W) => { + aarch64!(s; ldrsh w7, [sp, ot]) + } + U(W) => { + // ldrh zero-extends the half-word to fill the 32bits of the register + aarch64!(s; ldrh w7, [sp, ot]) + } + I(DW) | U(DW) => { + aarch64!(s; ldr w7, [sp, ot]) + } + I(QW) | U(QW) => { + aarch64!(s; ldr x7, [sp, ot]) + } + Buffer => { + aarch64!(s + ; ldr x7, [sp, ot] + ; ldr x7, [x7, 8] + ) + } + } + // 16 and 8 bit integers are 32 bit integers in v8 + self.offset_trampoline += max(param.size(), 4); + } + + (8.., param) => { + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > Function arguments may 
consume slots on the stack that are not multiples of 8 bytes. + // (i.e. natural alignment instead of eightbyte alignment) + // + // N.B. V8 does not currently follow this Apple's policy, and instead aligns all arguments to 8 Byte boundaries. + // The current implementation follows the V8 incorrect calling convention for the sake of a seamless experience + // for the Deno users. Whenever upgrading V8 we should make sure that the bug has not been amended, and revert this + // workaround once it has been. The bug is being tracked in https://bugs.chromium.org/p/v8/issues/detail?id=13171 + let size_original = param.size(); + // 16 and 8 bit integers are 32 bit integers in v8 + // let size_trampl = max(size_original, 4); // <-- Apple alignment + let size_trampl = 8; // <-- V8 incorrect alignment + let padding_trampl = + padding_to_align(size_trampl, self.offset_trampoline); + let padding_callee = + padding_to_align(size_original, self.offset_callee); + let ot = self.offset_trampoline; + let oc = self.offset_callee; + match param { + I(B) | U(B) => aarch64!(s + ; ldr w8, [sp, ot + padding_trampl] + ; strb w8, [sp, oc + padding_callee] + ), + I(W) | U(W) => aarch64!(s + ; ldr w8, [sp, ot + padding_trampl] + ; strh w8, [sp, oc + padding_callee] + ), + I(DW) | U(DW) => aarch64!(s + ; ldr w8, [sp, ot + padding_trampl] + ; str w8, [sp, oc + padding_callee] + ), + I(QW) | U(QW) => aarch64!(s + ; ldr x8, [sp, ot + padding_trampl] + ; str x8, [sp, oc + padding_callee] + ), + Buffer => aarch64!(s + ; ldr x8, [sp, ot + padding_trampl] + ; ldr x8, [x8, 8] + ; str x8, [sp, oc + padding_callee] + ), + } + self.offset_trampoline += padding_trampl + size_trampl; + self.offset_callee += padding_callee + size_original; + + debug_assert!( + self.allocated_stack == 0 + || self.offset_callee <= self.allocated_stack + ); + } + }; + self.integral_params += 1; + } + + fn zero_first_arg(&mut self) { + debug_assert!( + self.integral_params == 0, + "the trampoline would zero the first argument after having overridden it with the second one" + ); + aarch64!(self.assmblr; mov x0, xzr); + } + + fn save_out_array_to_preserved_register(&mut self) { + let s = &mut self.assmblr; + // functions returning 64 bit integers have the out array appended as their last parameter, + // and it is a *FastApiTypedArray + match self.integral_params { + // x0 is always V8's receiver + 0 => aarch64!(s; ldr x19, [x1, 8]), + 1 => aarch64!(s; ldr x19, [x2, 8]), + 2 => aarch64!(s; ldr x19, [x3, 8]), + 3 => aarch64!(s; ldr x19, [x4, 8]), + 4 => aarch64!(s; ldr x19, [x5, 8]), + 5 => aarch64!(s; ldr x19, [x6, 8]), + 6 => aarch64!(s; ldr x19, [x7, 8]), + 7.. 
=> { + aarch64!(s + ; ldr x19, [sp, self.offset_trampoline] + ; ldr x19, [x19, 8] + ) + } + } + } + + fn wrap_return_value_in_out_array(&mut self) { + aarch64!(self.assmblr; str x0, [x19]); + } + + fn save_frame_record(&mut self) { + debug_assert!( + self.allocated_stack >= 16, + "the trampoline would try to save the frame record to the stack without having allocated enough space for it" + ); + aarch64!(self.assmblr + // Frame record is stored at the bottom of the stack frame + ; stp x29, x30, [sp, self.allocated_stack - 16] + ; add x29, sp, self.allocated_stack - 16 + ) + } + + fn recover_frame_record(&mut self) { + // The stack cannot have been deallocated before the frame record is restored + debug_assert!( + self.allocated_stack >= 16, + "the trampoline would try to load the frame record from the stack, but it couldn't possibly contain it" + ); + // Frame record is stored at the bottom of the stack frame + aarch64!(self.assmblr; ldp x29, x30, [sp, self.allocated_stack - 16]) + } + + fn save_preserved_register_to_stack(&mut self) { + // If a preserved register needs to be used, we must have allocated at least 32 bytes in the stack + // 16 for the frame record, 8 for the preserved register, and 8 for 16-byte alignment. + debug_assert!( + self.allocated_stack >= 32, + "the trampoline would try to save a register to the stack without having allocated enough space for it" + ); + // preserved register is stored after frame record + aarch64!(self.assmblr; str x19, [sp, self.allocated_stack - 24]); + } + + fn recover_preserved_register(&mut self) { + // The stack cannot have been deallocated before the preserved register is restored + // 16 for the frame record, 8 for the preserved register, and 8 for 16-byte alignment. + debug_assert!( + self.allocated_stack >= 32, + "the trampoline would try to recover the value of a register from the stack, but it couldn't possibly contain it" + ); + // preserved register is stored after frame record + aarch64!(self.assmblr; ldr x19, [sp, self.allocated_stack - 24]); + } + + fn allocate_stack(&mut self, symbol: &Symbol) { + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms: + // > Function arguments may consume slots on the stack that are not multiples of 8 bytes. + // (i.e. natural alignment instead of eightbyte alignment) + let mut int_params = 0u32; + let mut float_params = 0u32; + let mut stack_size = 0u32; + for param in symbol.parameter_types.iter().copied() { + match param.into() { + Float(float_param) => { + float_params += 1; + if float_params > Self::FLOAT_REGISTERS { + stack_size += float_param.size(); + } + } + Int(integral_param) => { + int_params += 1; + if int_params > Self::INTEGRAL_REGISTERS { + stack_size += integral_param.size(); + } + } + } + } + + // Section 6.2.3 of the Aarch64 PCS: + // > Each frame shall link to the frame of its caller by means of a frame record of two 64-bit values on the stack + stack_size += 16; + + if self.must_save_preserved_register_to_stack(symbol) { + stack_size += 8; + } + + // Section 6.2.2 of Aarch64 PCS: + // > At any point at which memory is accessed via SP, the hardware requires that + // > - SP mod 16 = 0. The stack must be quad-word aligned. + // > The stack must also conform to the following constraint at a public interface: + // > - SP mod 16 = 0. The stack must be quad-word aligned. 
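+ // For example (illustrative numbers): nine eightbyte integral parameters with a 64-bit
+ // integer return give one spilled argument (8 bytes), plus 16 bytes for the frame record
+ // and 8 for the preserved x19, i.e. stack_size == 32, which is already quad-word aligned,
+ // so no extra padding is added here.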
+ stack_size += padding_to_align(16, stack_size); + + if stack_size > 0 { + aarch64!(self.assmblr; sub sp, sp, stack_size); + self.offset_trampoline += stack_size; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.allocated_stack += stack_size; + } + } + + fn deallocate_stack(&mut self) { + if self.allocated_stack > 0 { + aarch64!(self.assmblr; add sp, sp, self.allocated_stack); + self.allocated_stack = 0; + } + } + + fn call(&mut self, ptr: *const c_void) { + // the stack has been aligned during stack allocation + // Frame record has been stored in stack and frame pointer points to it + debug_assert!( + self.allocated_stack % 16 == 0, + "the trampoline would call the FFI function with an unaligned stack" + ); + debug_assert!( + self.allocated_stack >= 16, + "the trampoline would call the FFI function without allocating enough stack for the frame record" + ); + self.load_callee_address(ptr); + aarch64!(self.assmblr; blr x8); + } + + fn tailcall(&mut self, ptr: *const c_void) { + // stack pointer is never modified and remains aligned + // frame pointer and link register remain the one provided by the trampoline's caller (V8) + debug_assert!( + self.allocated_stack == 0, + "the trampoline would tail call the FFI function with an outstanding stack allocation" + ); + self.load_callee_address(ptr); + aarch64!(self.assmblr; br x8); + } + + fn ret(&mut self) { + debug_assert!( + self.allocated_stack == 0, + "the trampoline would return with an outstanding stack allocation" + ); + aarch64!(self.assmblr; ret); + } + + fn load_callee_address(&mut self, ptr: *const c_void) { + // Like all ARM instructions, move instructions are 32bit long and can fit at most 16bit immediates. + // bigger immediates are loaded in multiple steps applying a left-shift modifier + let mut address = ptr as u64; + let mut imm16 = address & 0xFFFF; + aarch64!(self.assmblr; movz x8, imm16 as u32); + address >>= 16; + let mut shift = 16; + while address > 0 { + imm16 = address & 0xFFFF; + if imm16 != 0 { + aarch64!(self.assmblr; movk x8, imm16 as u32, lsl shift); + } + address >>= 16; + shift += 16; + } + } + + fn is_recv_arg_overridden(&self) -> bool { + // V8 receiver is the first parameter of the trampoline function and is a pointer + self.integral_params > 0 + } + + fn must_save_preserved_register_to_stack(&mut self, symbol: &Symbol) -> bool { + self.must_wrap_return_value_in_typed_array(symbol.result_type) + } + + fn must_wrap_return_value_in_typed_array(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 64 bit integers by wrapping them in a TypedArray out parameter + crate::needs_unwrap(rv) + } + + fn finalize(self) -> ExecutableBuffer { + self.assmblr.finalize().unwrap() + } +} + +struct Win64 { + // Reference: https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/build/x64-calling-convention.md + assmblr: dynasmrt::x64::Assembler, + // Params counter (Windows does not distinguish by type with regards to parameter position) + params: u32, + // Stack offset accumulators + offset_trampoline: u32, + offset_callee: u32, + allocated_stack: u32, + frame_pointer: u32, +} + +#[cfg_attr( + not(all(target_aarch = "x86_64", target_family = "windows")), + allow(dead_code) +)] +impl Win64 { + // Section "Parameter Passing" of the Windows x64 calling convention: + // > By default, the x64 calling convention passes the first four arguments to a function in registers. 
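+ // (Concretely: integer arguments go in RCX, RDX, R8 and R9, floating-point arguments in
+ // XMM0-XMM3, with the register chosen by parameter position rather than by how many
+ // arguments of each class precede it.)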
+ const REGISTERS: u32 = 4; + + fn new() -> Self { + Self { + assmblr: dynasmrt::x64::Assembler::new().unwrap(), + params: 0, + // trampoline caller's return address + trampoline's shadow space + offset_trampoline: 8 + 32, + offset_callee: 8 + 32, + allocated_stack: 0, + frame_pointer: 0, + } + } + + fn compile(sym: &Symbol) -> Trampoline { + let mut compiler = Self::new(); + + let must_cast_return_value = + compiler.must_cast_return_value(sym.result_type); + let must_wrap_return_value = + compiler.must_wrap_return_value_in_typed_array(sym.result_type); + let must_save_preserved_register = must_wrap_return_value; + let cannot_tailcall = must_cast_return_value || must_wrap_return_value; + + if cannot_tailcall { + if must_save_preserved_register { + compiler.save_preserved_register_to_stack(); + } + compiler.allocate_stack(&sym.parameter_types); + } + + for param in sym.parameter_types.iter().copied() { + compiler.move_left(param) + } + if !compiler.is_recv_arg_overridden() { + // the receiver object should never be expected. Avoid its unexpected or deliberate leak + compiler.zero_first_arg(); + } + if must_wrap_return_value { + compiler.save_out_array_to_preserved_register(); + } + + if cannot_tailcall { + compiler.call(sym.ptr.as_ptr()); + if must_cast_return_value { + compiler.cast_return_value(sym.result_type); + } + if must_wrap_return_value { + compiler.wrap_return_value_in_out_array(); + } + compiler.deallocate_stack(); + if must_save_preserved_register { + compiler.recover_preserved_register(); + } + compiler.ret(); + } else { + compiler.tailcall(sym.ptr.as_ptr()); + } + + Trampoline(compiler.finalize()) + } + + fn move_left(&mut self, param: NativeType) { + // Section "Parameter Passing" of the Windows x64 calling convention: + // > By default, the x64 calling convention passes the first four arguments to a function in registers. + // > The registers used for these arguments depend on the position and type of the argument. + // > Remaining arguments get pushed on the stack in right-to-left order. + // > [...] + // > Integer valued arguments in the leftmost four positions are passed in left-to-right order in RCX, RDX, R8, and R9 + // > [...] + // > Any floating-point and double-precision arguments in the first four parameters are passed in XMM0 - XMM3, depending on position + let s = &mut self.assmblr; + let param_i = self.params; + + // move each argument one position to the left. The first argument in the stack moves to the last register (r9 or xmm3). + // If the FFI function is called with a new stack frame, the arguments remaining in the stack are copied to the new stack frame. + // Otherwise, they are copied 8 bytes lower in the same frame + match (param_i, param.into()) { + // Section "Parameter Passing" of the Windows x64 calling convention: + // > All integer arguments in registers are right-justified, so the callee can ignore the upper bits of the register + // > and access only the portion of the register necessary. + // (i.e. 
unlike in SysV or Aarch64-Apple, 8/16 bit integers are not expected to be zero/sign extended) + (0, Int(U(B | W | DW) | I(B | W | DW))) => x64!(s; mov ecx, edx), + (0, Int(U(QW) | I(QW))) => x64!(s; mov rcx, rdx), + // The fast API expects buffer arguments passed as a pointer to a FastApiTypedArray struct + // Here we blindly follow the layout of https://github.com/denoland/rusty_v8/blob/main/src/fast_api.rs#L190-L200 + // although that might be problematic: https://discord.com/channels/684898665143206084/956626010248478720/1009450940866252823 + (0, Int(Buffer)) => x64!(s; mov rcx, [rdx + 8]), + // Use movaps for singles and doubles, benefits of smaller encoding outweigh those of using the correct instruction for the type, + // which for doubles should technically be movapd + (0, Float(_)) => { + x64!(s; movaps xmm0, xmm1); + self.zero_first_arg(); + } + + (1, Int(U(B | W | DW) | I(B | W | DW))) => x64!(s; mov edx, r8d), + (1, Int(U(QW) | I(QW))) => x64!(s; mov rdx, r8), + (1, Int(Buffer)) => x64!(s; mov rdx, [r8 + 8]), + (1, Float(_)) => x64!(s; movaps xmm1, xmm2), + + (2, Int(U(B | W | DW) | I(B | W | DW))) => x64!(s; mov r8d, r9d), + (2, Int(U(QW) | I(QW))) => x64!(s; mov r8, r9), + (2, Int(Buffer)) => x64!(s; mov r8, [r9 + 8]), + (2, Float(_)) => x64!(s; movaps xmm2, xmm3), + + (3, param) => { + let ot = self.offset_trampoline as i32; + match param { + Int(U(B | W | DW) | I(B | W | DW)) => { + x64!(s; mov r9d, [rsp + ot]) + } + Int(U(QW) | I(QW)) => { + x64!(s; mov r9, [rsp + ot]) + } + Int(Buffer) => { + x64!(s + ; mov r9, [rsp + ot] + ; mov r9, [r9 + 8]) + } + Float(_) => { + // parameter 4 is always 16-byte aligned, so we can use movaps instead of movups + x64!(s; movaps xmm3, [rsp + ot]) + } + } + // Section "x64 Aggregate and Union layout" of the windows x64 software conventions doc: + // > The alignment of the beginning of a structure or a union is the maximum alignment of any individual member + // Ref: https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/build/x64-software-conventions.md#x64-aggregate-and-union-layout + self.offset_trampoline += 8; + } + (4.., param) => { + let ot = self.offset_trampoline as i32; + let oc = self.offset_callee as i32; + match param { + Int(U(B | W | DW) | I(B | W | DW)) => { + x64!(s + ; mov eax, [rsp + ot] + ; mov [rsp + oc], eax + ) + } + Int(U(QW) | I(QW)) => { + x64!(s + ; mov rax, [rsp + ot] + ; mov [rsp + oc], rax + ) + } + Int(Buffer) => { + x64!(s + ; mov rax, [rsp + ot] + ; mov rax, [rax + 8] + ; mov [rsp + oc], rax + ) + } + Float(_) => { + x64!(s + ; movups xmm4, [rsp + ot] + ; movups [rsp + oc], xmm4 + ) + } + } + // Section "x64 Aggregate and Union layout" of the windows x64 software conventions doc: + // > The alignment of the beginning of a structure or a union is the maximum alignment of any individual member + // Ref: https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/build/x64-software-conventions.md#x64-aggregate-and-union-layout + self.offset_trampoline += 8; + self.offset_callee += 8; + + debug_assert!( + self.allocated_stack == 0 + || self.offset_callee <= self.allocated_stack + ); + } + } + self.params += 1; + } + + fn zero_first_arg(&mut self) { + debug_assert!( + self.params == 0, + "the trampoline would zero the first argument after having overridden it with the second one" + ); + x64!(self.assmblr; xor ecx, ecx); + } + + fn cast_return_value(&mut self, rv: NativeType) { + let s = &mut self.assmblr; + // V8 only supports 32bit integers. We support 8 and 16 bit integers casting them to 32bits. 
+ // Section "Return Values" of the Windows x64 Calling Convention doc: + // > The state of unused bits in the value returned in RAX or XMM0 is undefined. + match rv { + NativeType::U8 => x64!(s; movzx eax, al), + NativeType::I8 => x64!(s; movsx eax, al), + NativeType::U16 => x64!(s; movzx eax, ax), + NativeType::I16 => x64!(s; movsx eax, ax), + _ => (), + } + } + + fn save_out_array_to_preserved_register(&mut self) { + let s = &mut self.assmblr; + // functions returning 64 bit integers have the out array appended as their last parameter, + // and it is a *FastApiTypedArray + match self.params { + // rcx is always V8 receiver + 0 => x64!(s; mov rbx, [rdx + 8]), + 1 => x64!(s; mov rbx, [r8 + 8]), + 2 => x64!(s; mov rbx, [r9 + 8]), + 3.. => { + x64!(s + ; mov rax, [rsp + self.offset_trampoline as i32] + ; mov rbx, [rax + 8] + ) + } + } + } + + fn wrap_return_value_in_out_array(&mut self) { + x64!(self.assmblr; mov [rbx], rax) + } + + fn save_preserved_register_to_stack(&mut self) { + x64!(self.assmblr; push rbx); + self.offset_trampoline += 8; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack + self.offset_callee = 0; + self.frame_pointer += 8; + } + + fn recover_preserved_register(&mut self) { + debug_assert!( + self.frame_pointer >= 8, + "the trampoline would try to pop from the stack beyond its frame pointer" + ); + x64!(self.assmblr; pop rbx); + self.frame_pointer -= 8; + // parameter offsets are invalid once this method is called + } + + fn allocate_stack(&mut self, params: &[NativeType]) { + let mut stack_size = 0; + // Section "Calling Convention Defaults" of the x64-calling-convention and Section "Stack Allocation" of the stack-usage docs: + // > The x64 Application Binary Interface (ABI) uses a four-register fast-call calling convention by default. + // > Space is allocated on the call stack as a shadow store for callees to save those registers. + // > [...] + // > Any parameters beyond the first four must be stored on the stack after the shadow store before the call + // > [...] 
+ // > Even if the called function has fewer than 4 parameters, these 4 stack locations are effectively owned by the called function, + // > and may be used by the called function for other purposes besides saving parameter register values + stack_size += max(params.len() as u32, 4) * 8; + + // Align new stack frame (accounting for the 8 byte of the trampoline caller's return address + // and any other potential addition to the stack prior to this allocation) + // Section "Stack Allocation" of stack-usage docs: + // > The stack will always be maintained 16-byte aligned, except within the prolog (for example, after the return address is pushed) + stack_size += padding_to_align(16, self.frame_pointer + stack_size + 8); + + x64!(self.assmblr; sub rsp, stack_size as i32); + self.offset_trampoline += stack_size; + // stack pointer has been modified, and the callee stack parameters are expected at the top of the stack right after the shadow space + self.offset_callee = 32; + self.allocated_stack += stack_size; + self.frame_pointer += stack_size; + } + + fn deallocate_stack(&mut self) { + debug_assert!( + self.frame_pointer >= self.allocated_stack, + "the trampoline would try to deallocate stack beyond its frame pointer" + ); + x64!(self.assmblr; add rsp, self.allocated_stack as i32); + self.frame_pointer -= self.allocated_stack; + self.allocated_stack = 0; + } + + fn call(&mut self, ptr: *const c_void) { + // the stack has been aligned during stack allocation and/or pushing of preserved registers + debug_assert!( + (8 + self.frame_pointer) % 16 == 0, + "the trampoline would call the FFI function with an unaligned stack" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; call rax + ); + } + + fn tailcall(&mut self, ptr: *const c_void) { + // stack pointer is never modified and remains aligned + // return address remains the one provided by the trampoline's caller (V8) + debug_assert!( + self.allocated_stack == 0, + "the trampoline would tail call the FFI function with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would tail call the FFI function with outstanding locals in the frame" + ); + x64!(self.assmblr + ; mov rax, QWORD ptr as _ + ; jmp rax + ); + } + + fn ret(&mut self) { + debug_assert!( + self.allocated_stack == 0, + "the trampoline would return with an outstanding stack allocation" + ); + debug_assert!( + self.frame_pointer == 0, + "the trampoline would return with outstanding locals in the frame" + ); + x64!(self.assmblr; ret); + } + + fn is_recv_arg_overridden(&self) -> bool { + self.params > 0 + } + + fn must_cast_return_value(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 8 and 16 bit integers by extending them to 32 bits in the trampoline before returning + matches!( + rv, + NativeType::U8 | NativeType::I8 | NativeType::U16 | NativeType::I16 + ) + } + + fn must_wrap_return_value_in_typed_array(&self, rv: NativeType) -> bool { + // V8 only supports i32 and u32 return types for integers + // We support 64 bit integers by wrapping them in a TypedArray out parameter + crate::needs_unwrap(rv) + } + + fn finalize(self) -> ExecutableBuffer { + self.assmblr.finalize().unwrap() + } +} + +fn padding_to_align(alignment: u32, size: u32) -> u32 { + (alignment - size % alignment) % alignment +} + +#[derive(Clone, Copy, Debug)] +enum Floating { + Single = 4, + Double = 8, +} + +impl Floating { + fn size(self) -> u32 { + self as u32 + } +} + +use Floating::*; + +#[derive(Clone, 
Copy, Debug)] +enum Integral { + I(Size), + U(Size), + Buffer, +} + +impl Integral { + fn size(self) -> u32 { + match self { + I(size) | U(size) => size as u32, + Buffer => 8, + } + } +} + +use Integral::*; + +#[derive(Clone, Copy, Debug)] +enum Size { + B = 1, + W = 2, + DW = 4, + QW = 8, +} +use Size::*; + +#[allow(clippy::enum_variant_names)] +#[derive(Clone, Copy, Debug)] +enum Param { + Int(Integral), + Float(Floating), +} + +use Param::*; + +impl From for Param { + fn from(native: NativeType) -> Self { + match native { + NativeType::F32 => Float(Single), + NativeType::F64 => Float(Double), + NativeType::Bool | NativeType::U8 => Int(U(B)), + NativeType::U16 => Int(U(W)), + NativeType::U32 | NativeType::Void => Int(U(DW)), + NativeType::U64 + | NativeType::USize + | NativeType::Pointer + | NativeType::Function => Int(U(QW)), + NativeType::I8 => Int(I(B)), + NativeType::I16 => Int(I(W)), + NativeType::I32 => Int(I(DW)), + NativeType::I64 | NativeType::ISize => Int(I(QW)), + NativeType::Buffer => Int(Buffer), + } + } +} + +#[cfg(test)] +mod tests { + use std::ptr::null_mut; + + use libffi::middle::Type; + + use crate::NativeType; + use crate::Symbol; + + fn symbol(parameters: Vec, ret: NativeType) -> Symbol { + Symbol { + cif: libffi::middle::Cif::new(vec![], Type::void()), + ptr: libffi::middle::CodePtr(null_mut()), + parameter_types: parameters, + result_type: ret, + can_callback: false, + } + } + + mod sysv_amd64 { + use std::ops::Deref; + + use dynasmrt::dynasm; + use dynasmrt::DynasmApi; + + use super::super::SysVAmd64; + use super::symbol; + use crate::NativeType::*; + + #[test] + fn tailcall() { + let trampoline = SysVAmd64::compile(&symbol( + vec![ + U8, U16, I16, I8, U32, U64, Buffer, Function, I64, I32, I16, I8, F32, + F32, F32, F32, F64, F64, F64, F64, F32, F64, + ], + Void, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/KE9x1h9xq + dynasm!(assembler + ; .arch x64 + ; movzx edi, sil // u8 + ; movzx esi, dx // u16 + ; movsx edx, cx // i16 + ; movsx ecx, r8b // i8 + ; mov r8d, r9d // u32 + ; mov r9, [DWORD rsp + 8] // u64 + ; mov rax, [DWORD rsp + 16] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 8], rax // .. + ; mov rax, [DWORD rsp + 24] // Function + ; mov [DWORD rsp + 16], rax // .. + ; mov rax, [DWORD rsp + 32] // i64 + ; mov [DWORD rsp + 24], rax // .. + ; mov eax, [DWORD rsp + 40] // i32 + ; mov [DWORD rsp + 32], eax // .. + ; movsx eax, WORD [DWORD rsp + 48] // i16 + ; mov [DWORD rsp + 40], eax // .. + ; movsx eax, BYTE [DWORD rsp + 56] // i8 + ; mov [DWORD rsp + 48], eax // .. + ; movss xmm8, [DWORD rsp + 64] // f32 + ; movss [DWORD rsp + 56], xmm8 // .. + ; movsd xmm8, [DWORD rsp + 72] // f64 + ; movsd [DWORD rsp + 64], xmm8 // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn integer_casting() { + let trampoline = SysVAmd64::compile(&symbol( + vec![U8, U16, I8, I16, U8, U16, I8, I16, U8, U16, I8, I16], + I8, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/qo59bPsfv + dynasm!(assembler + ; .arch x64 + ; sub rsp, DWORD 56 // stack allocation + ; movzx edi, sil // u8 + ; movzx esi, dx // u16 + ; movsx edx, cl // i8 + ; movsx ecx, r8w // i16 + ; movzx r8d, r9b // u8 + ; movzx r9d, WORD [DWORD rsp + 64] // u16 + ; movsx eax, BYTE [DWORD rsp + 72] // i8 + ; mov [DWORD rsp + 0], eax // .. 
+ ; movsx eax, WORD [DWORD rsp + 80] // i16 + ; mov [DWORD rsp + 8], eax // .. + ; movzx eax, BYTE [DWORD rsp + 88] // u8 + ; mov [DWORD rsp + 16], eax // .. + ; movzx eax, WORD [DWORD rsp + 96] // u16 + ; mov [DWORD rsp + 24], eax // .. + ; movsx eax, BYTE [DWORD rsp + 104] // i8 + ; mov [DWORD rsp + 32], eax // .. + ; movsx eax, WORD [DWORD rsp + 112] // i16 + ; mov [DWORD rsp + 40], eax // .. + ; mov rax, QWORD 0 + ; call rax + ; movsx eax, al // return value cast + ; add rsp, DWORD 56 // stack deallocation + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn buffer_parameters() { + let trampoline = SysVAmd64::compile(&symbol( + vec![ + Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, + ], + Void, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/hqv63M3Ko + dynasm!(assembler + ; .arch x64 + ; mov rdi, [rsi + 8] // Buffer + ; mov rsi, [rdx + 8] // Buffer + ; mov rdx, [rcx + 8] // Buffer + ; mov rcx, [r8 + 8] // Buffer + ; mov r8, [r9 + 8] // Buffer + ; mov r9, [DWORD rsp + 8] // Buffer + ; mov r9, [r9 + 8] // .. + ; mov rax, [DWORD rsp + 16] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 8], rax // .. + ; mov rax, [DWORD rsp + 24] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 16], rax // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_register_typed_array() { + let trampoline = SysVAmd64::compile(&symbol(vec![], U64)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/8G7a488o7 + dynasm!(assembler + ; .arch x64 + ; push rbx + ; xor edi, edi // recv + ; mov rbx, [rsi + 8] // save data array pointer to non-volatile register + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_stack_typed_array() { + let trampoline = SysVAmd64::compile(&symbol( + vec![U64, U64, U64, U64, U64, U64, U64], + U64, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/cPnPYWdWq + dynasm!(assembler + ; .arch x64 + ; push rbx + ; sub rsp, DWORD 16 + ; mov rdi, rsi // u64 + ; mov rsi, rdx // u64 + ; mov rdx, rcx // u64 + ; mov rcx, r8 // u64 + ; mov r8, r9 // u64 + ; mov r9, [DWORD rsp + 32] // u64 + ; mov rax, [DWORD rsp + 40] // u64 + ; mov [DWORD rsp + 0], rax // .. + ; mov rax, [DWORD rsp + 48] // save data array pointer to non-volatile register + ; mov rbx, [rax + 8] // .. 
+ ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; add rsp, DWORD 16 + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + } + + mod aarch64_apple { + use std::ops::Deref; + + use dynasmrt::dynasm; + + use super::super::Aarch64Apple; + use super::symbol; + use crate::NativeType::*; + + #[test] + fn tailcall() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![ + U8, U16, I16, I8, U32, U64, Buffer, Function, I64, I32, I16, I8, F32, + F32, F32, F32, F64, F64, F64, F64, F32, F64, + ], + Void, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/oefqYWT13 + dynasm!(assembler + ; .arch aarch64 + ; and w0, w1, 0xFF // u8 + ; and w1, w2, 0xFFFF // u16 + ; sxth w2, w3 // i16 + ; sxtb w3, w4 // i8 + ; mov w4, w5 // u32 + ; mov x5, x6 // u64 + ; ldr x6, [x7, 8] // Buffer + ; ldr x7, [sp] // Function + ; ldr x8, [sp, 8] // i64 + ; str x8, [sp] // .. + ; ldr w8, [sp, 16] // i32 + ; str w8, [sp, 8] // .. + ; ldr w8, [sp, 24] // i16 + ; strh w8, [sp, 12] // .. + ; ldr w8, [sp, 32] // i8 + ; strb w8, [sp, 14] // .. + ; ldr s16, [sp, 40] // f32 + ; str s16, [sp, 16] // .. + ; ldr d16, [sp, 48] // f64 + ; str d16, [sp, 24] // .. + ; movz x8, 0 + ; br x8 + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn integer_casting() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![U8, U16, I8, I16, U8, U16, I8, I16, U8, U16, I8, I16], + I8, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/7qfzbzobM + dynasm!(assembler + ; .arch aarch64 + ; and w0, w1, 0xFF // u8 + ; and w1, w2, 0xFFFF // u16 + ; sxtb w2, w3 // i8 + ; sxth w3, w4 // i16 + ; and w4, w5, 0xFF // u8 + ; and w5, w6, 0xFFFF // u16 + ; sxtb w6, w7 // i8 + ; ldrsh w7, [sp] // i16 + ; ldr w8, [sp, 8] // u8 + ; strb w8, [sp] // .. + ; ldr w8, [sp, 16] // u16 + ; strh w8, [sp, 2] // .. + ; ldr w8, [sp, 24] // i8 + ; strb w8, [sp, 4] // .. + ; ldr w8, [sp, 32] // i16 + ; strh w8, [sp, 6] // .. + ; movz x8, 0 + ; br x8 + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn buffer_parameters() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![ + Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, + Buffer, Buffer, + ], + Void, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/obd6z6vsf + dynasm!(assembler + ; .arch aarch64 + ; ldr x0, [x1, 8] // Buffer + ; ldr x1, [x2, 8] // Buffer + ; ldr x2, [x3, 8] // Buffer + ; ldr x3, [x4, 8] // Buffer + ; ldr x4, [x5, 8] // Buffer + ; ldr x5, [x6, 8] // Buffer + ; ldr x6, [x7, 8] // Buffer + ; ldr x7, [sp] // Buffer + ; ldr x7, [x7, 8] // .. + ; ldr x8, [sp, 8] // Buffer + ; ldr x8, [x8, 8] // .. + ; str x8, [sp] // .. + ; ldr x8, [sp, 16] // Buffer + ; ldr x8, [x8, 8] // .. + ; str x8, [sp, 8] // .. 
+ ; movz x8, 0 + ; br x8 + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_register_typed_array() { + let trampoline = Aarch64Apple::compile(&symbol(vec![], U64)); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/47EvvYb83 + dynasm!(assembler + ; .arch aarch64 + ; sub sp, sp, 32 + ; stp x29, x30, [sp, 16] + ; add x29, sp, 16 + ; str x19, [sp, 8] + ; mov x0, xzr // recv + ; ldr x19, [x1, 8] // save data array pointer to non-volatile register + ; movz x8, 0 + ; blr x8 + ; str x0, [x19] // copy return value to data pointer address + ; ldr x19, [sp, 8] + ; ldp x29, x30, [sp, 16] + ; add sp, sp, 32 + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_stack_typed_array() { + let trampoline = Aarch64Apple::compile(&symbol( + vec![U64, U64, U64, U64, U64, U64, U64, U64, U8, U8], + U64, + )); + + let mut assembler = dynasmrt::aarch64::Assembler::new().unwrap(); + // See https://godbolt.org/z/PvYPbsE1b + dynasm!(assembler + ; .arch aarch64 + ; sub sp, sp, 32 + ; stp x29, x30, [sp, 16] + ; add x29, sp, 16 + ; str x19, [sp, 8] + ; mov x0, x1 // u64 + ; mov x1, x2 // u64 + ; mov x2, x3 // u64 + ; mov x3, x4 // u64 + ; mov x4, x5 // u64 + ; mov x5, x6 // u64 + ; mov x6, x7 // u64 + ; ldr x7, [sp, 32] // u64 + ; ldr w8, [sp, 40] // u8 + ; strb w8, [sp] // .. + ; ldr w8, [sp, 48] // u8 + ; strb w8, [sp, 1] // .. + ; ldr x19, [sp, 56] // save data array pointer to non-volatile register + ; ldr x19, [x19, 8] // .. + ; movz x8, 0 + ; blr x8 + ; str x0, [x19] // copy return value to data pointer address + ; ldr x19, [sp, 8] + ; ldp x29, x30, [sp, 16] + ; add sp, sp, 32 + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + } + + mod x64_windows { + use std::ops::Deref; + + use dynasmrt::{dynasm, DynasmApi}; + + use super::super::Win64; + use super::symbol; + use crate::NativeType::*; + + #[test] + fn tailcall() { + let trampoline = + Win64::compile(&symbol(vec![U8, I16, F64, F32, U32, I8, Buffer], Void)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/TYzqrf9aj + dynasm!(assembler + ; .arch x64 + ; mov ecx, edx // u8 + ; mov edx, r8d // i16 + ; movaps xmm2, xmm3 // f64 + ; movaps xmm3, [DWORD rsp + 40] // f32 + ; mov eax, [DWORD rsp + 48] // u32 + ; mov [DWORD rsp + 40], eax // .. + ; mov eax, [DWORD rsp + 56] // i8 + ; mov [DWORD rsp + 48], eax // .. + ; mov rax, [DWORD rsp + 64] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 56], rax // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn integer_casting() { + let trampoline = Win64::compile(&symbol( + vec![U8, U16, I8, I16, U8, U16, I8, I16, U8, U16, I8, I16], + I8, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/KMx56KGTq + dynasm!(assembler + ; .arch x64 + ; sub rsp, DWORD 104 // stack allocation + ; mov ecx, edx // u8 + ; mov edx, r8d // u16 + ; mov r8d, r9d // i8 + ; mov r9d, [DWORD rsp + 144] // i16 + ; mov eax, [DWORD rsp + 152] // u8 + ; mov [DWORD rsp + 32], eax // .. + ; mov eax, [DWORD rsp + 160] // u16 + ; mov [DWORD rsp + 40], eax // u16 + ; mov eax, [DWORD rsp + 168] // i8 + ; mov [DWORD rsp + 48], eax // .. 
+ ; mov eax, [DWORD rsp + 176] // i16 + ; mov [DWORD rsp + 56], eax // .. + ; mov eax, [DWORD rsp + 184] // u8 + ; mov [DWORD rsp + 64], eax // .. + ; mov eax, [DWORD rsp + 192] // u16 + ; mov [DWORD rsp + 72], eax // .. + ; mov eax, [DWORD rsp + 200] // i8 + ; mov [DWORD rsp + 80], eax // .. + ; mov eax, [DWORD rsp + 208] // i16 + ; mov [DWORD rsp + 88], eax // .. + ; mov rax, QWORD 0 + ; call rax + ; movsx eax, al // return value cast + ; add rsp, DWORD 104 // stack deallocation + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn buffer_parameters() { + let trampoline = Win64::compile(&symbol( + vec![Buffer, Buffer, Buffer, Buffer, Buffer, Buffer], + Void, + )); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/TYzqrf9aj + dynasm!(assembler + ; .arch x64 + ; mov rcx, [rdx + 8] // Buffer + ; mov rdx, [r8 + 8] // Buffer + ; mov r8, [r9 + 8] // Buffer + ; mov r9, [DWORD rsp + 40] // Buffer + ; mov r9, [r9 + 8] // .. + ; mov rax, [DWORD rsp + 48] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 40], rax // .. + ; mov rax, [DWORD rsp + 56] // Buffer + ; mov rax, [rax + 8] // .. + ; mov [DWORD rsp + 48], rax // .. + ; mov rax, QWORD 0 + ; jmp rax + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_register_typed_array() { + let trampoline = Win64::compile(&symbol(vec![], U64)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/7EnPE7o3T + dynasm!(assembler + ; .arch x64 + ; push rbx + ; sub rsp, DWORD 32 + ; xor ecx, ecx // recv + ; mov rbx, [rdx + 8] // save data array pointer to non-volatile register + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; add rsp, DWORD 32 + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + + #[test] + fn return_u64_in_stack_typed_array() { + let trampoline = + Win64::compile(&symbol(vec![U64, U64, U64, U64, U64], U64)); + + let mut assembler = dynasmrt::x64::Assembler::new().unwrap(); + // See https://godbolt.org/z/3966sfEex + dynasm!(assembler + ; .arch x64 + ; push rbx + ; sub rsp, DWORD 48 + ; mov rcx, rdx // u64 + ; mov rdx, r8 // u64 + ; mov r8, r9 // u64 + ; mov r9, [DWORD rsp + 96] // u64 + ; mov rax, [DWORD rsp + 104] // u64 + ; mov [DWORD rsp + 32], rax // .. + ; mov rax, [DWORD rsp + 112] // save data array pointer to non-volatile register + ; mov rbx, [rax + 8] // .. + ; mov rax, QWORD 0 + ; call rax + ; mov [rbx], rax // copy return value to data pointer address + ; add rsp, DWORD 48 + ; pop rbx + ; ret + ); + let expected = assembler.finalize().unwrap(); + assert_eq!(trampoline.0.deref(), expected.deref()); + } + } +} diff --git a/ext/ffi/jit_trampoline.rs b/ext/ffi/jit_trampoline.rs deleted file mode 100644 index 6cb8ec74c2..0000000000 --- a/ext/ffi/jit_trampoline.rs +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. - -use crate::NativeType; -use crate::{tcc::Compiler, Symbol}; -use std::ffi::c_void; -use std::ffi::CString; -use std::fmt::Write as _; -use std::mem::size_of; - -const _: () = assert!(size_of::() == size_of::()); - -pub(crate) struct Allocation { - pub addr: *mut c_void, - _ctx: Compiler, - _sym: Box, -} - -macro_rules! 
cstr { - ($st:expr) => { - &CString::new($st).unwrap() - }; -} - -fn native_arg_to_c(ty: &NativeType) -> &'static str { - match ty { - NativeType::Bool => "bool", - NativeType::U8 | NativeType::U16 | NativeType::U32 => "uint32_t", - NativeType::I8 | NativeType::I16 | NativeType::I32 => "int32_t", - NativeType::Void => "void", - NativeType::F32 => "float", - NativeType::F64 => "double", - NativeType::U64 => "uint64_t", - NativeType::I64 => "int64_t", - NativeType::ISize => "intptr_t", - NativeType::USize => "uintptr_t", - NativeType::Buffer => "struct FastApiTypedArray*", - NativeType::Function | NativeType::Pointer => "void*", - } -} - -fn native_to_c(ty: &NativeType) -> &'static str { - match ty { - NativeType::Bool => "bool", - NativeType::U8 => "uint8_t", - NativeType::U16 => "uint16_t", - NativeType::U32 => "uint32_t", - NativeType::I8 => "int8_t", - NativeType::I16 => "uint16_t", - NativeType::I32 => "int32_t", - NativeType::Void => "void", - NativeType::F32 => "float", - NativeType::F64 => "double", - NativeType::U64 => "uint64_t", - NativeType::I64 => "int64_t", - NativeType::ISize => "intptr_t", - NativeType::USize => "uintptr_t", - NativeType::Pointer | NativeType::Buffer | NativeType::Function => "void*", - } -} - -pub(crate) fn codegen(sym: &crate::Symbol) -> String { - let mut c = String::from(include_str!("prelude.h")); - let needs_unwrap = crate::needs_unwrap(sym.result_type); - - // Return type of the FFI call. - let ffi_ret = native_to_c(&sym.result_type); - // Return type of the trampoline. - let ret = if needs_unwrap { "void" } else { ffi_ret }; - - // extern func( - let _ = write!(c, "\nextern {ffi_ret} func("); - // p0, p1, ...); - for (i, ty) in sym.parameter_types.iter().enumerate() { - if i > 0 { - c += ", "; - } - c += native_to_c(ty); - let _ = write!(c, " p{i}"); - } - c += ");\n\n"; - - // void* recv, p0, p1, ...); - c += ret; - c += " func_trampoline("; - c += "void* recv"; - for (i, ty) in sym.parameter_types.iter().enumerate() { - c += ", "; - c += native_arg_to_c(ty); - let _ = write!(c, " p{i}"); - } - if needs_unwrap { - let _ = write!(c, ", struct FastApiTypedArray* const p_ret"); - } - c += ") {\n"; - // func(p0, p1, ...); - let mut call_s = String::from("func("); - { - for (i, ty) in sym.parameter_types.iter().enumerate() { - if i > 0 { - call_s += ", "; - } - if matches!(ty, NativeType::Buffer) { - let _ = write!(call_s, "p{i}->data"); - } else { - let _ = write!(call_s, "p{i}"); - } - } - call_s += ");\n"; - } - if needs_unwrap { - // r = func(p0, p1, ...); - // ((*)p_ret->data)[0] = r; - let _ = write!(c, " {ffi_ret} r = {call_s}"); - let _ = writeln!(c, " (({ffi_ret}*)p_ret->data)[0] = r;"); - } else { - // return func(p0, p1, ...); - let _ = write!(c, " return {call_s}"); - } - c += "}\n\n"; - c -} - -pub(crate) fn gen_trampoline( - sym: Box, -) -> Result, ()> { - let mut ctx = Compiler::new()?; - ctx.set_options(cstr!("-nostdlib")); - // SAFETY: symbol satisfies ABI requirement. 
- unsafe { ctx.add_symbol(cstr!("func"), sym.ptr.0 as *const c_void) }; - let c = codegen(&sym); - ctx.compile_string(cstr!(c))?; - let alloc = Allocation { - addr: ctx.relocate_and_get_symbol(cstr!("func_trampoline"))?, - _ctx: ctx, - _sym: sym, - }; - Ok(Box::new(alloc)) -} - -#[cfg(test)] -mod tests { - use super::*; - use libffi::middle::Type; - use std::ptr::null_mut; - - fn codegen(parameters: Vec, ret: NativeType) -> String { - let sym = Box::new(crate::Symbol { - cif: libffi::middle::Cif::new(vec![], Type::void()), - ptr: libffi::middle::CodePtr(null_mut()), - parameter_types: parameters, - result_type: ret, - can_callback: false, - }); - super::codegen(&sym) - } - - const PRELUDE: &str = include_str!("prelude.h"); - fn assert_codegen(expected: String, actual: &str) { - assert_eq!(expected, format!("{PRELUDE}\n{}", actual)) - } - - #[test] - fn test_gen_trampoline() { - assert_codegen( - codegen(vec![], NativeType::Void), - "extern void func();\n\n\ - void func_trampoline(void* recv) {\ - \n return func();\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::U32, NativeType::U32], NativeType::U32), - "extern uint32_t func(uint32_t p0, uint32_t p1);\n\n\ - uint32_t func_trampoline(void* recv, uint32_t p0, uint32_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::I32, NativeType::I32], NativeType::I32), - "extern int32_t func(int32_t p0, int32_t p1);\n\n\ - int32_t func_trampoline(void* recv, int32_t p0, int32_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::F32, NativeType::F32], NativeType::F32), - "extern float func(float p0, float p1);\n\n\ - float func_trampoline(void* recv, float p0, float p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::F64, NativeType::F64], NativeType::F64), - "extern double func(double p0, double p1);\n\n\ - double func_trampoline(void* recv, double p0, double p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Buffer, NativeType::U32], NativeType::U32), - "extern uint32_t func(void* p0, uint32_t p1);\n\n\ - uint32_t func_trampoline(void* recv, struct FastApiTypedArray* p0, uint32_t p1) {\ - \n return func(p0->data, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Buffer, NativeType::Buffer], NativeType::U32), - "extern uint32_t func(void* p0, void* p1);\n\n\ - uint32_t func_trampoline(void* recv, struct FastApiTypedArray* p0, struct FastApiTypedArray* p1) {\ - \n return func(p0->data, p1->data);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![], NativeType::U64), - "extern uint64_t func();\n\n\ - void func_trampoline(void* recv, struct FastApiTypedArray* const p_ret) {\ - \n uint64_t r = func();\ - \n ((uint64_t*)p_ret->data)[0] = r;\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Buffer, NativeType::Buffer], NativeType::U64), - "extern uint64_t func(void* p0, void* p1);\n\n\ - void func_trampoline(void* recv, struct FastApiTypedArray* p0, struct FastApiTypedArray* p1, struct FastApiTypedArray* const p_ret) {\ - \n uint64_t r = func(p0->data, p1->data);\ - \n ((uint64_t*)p_ret->data)[0] = r;\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::Pointer, NativeType::Pointer], NativeType::U64), - "extern uint64_t func(void* p0, void* p1);\n\n\ - void func_trampoline(void* recv, void* p0, void* p1, struct FastApiTypedArray* const p_ret) {\ - \n uint64_t r = func(p0, p1);\ - \n ((uint64_t*)p_ret->data)[0] = r;\n\ - 
}\n\n", - ); - } - - #[test] - fn test_gen_trampoline_implicit_cast() { - assert_codegen( - codegen(vec![NativeType::I8, NativeType::U8], NativeType::I8), - "extern int8_t func(int8_t p0, uint8_t p1);\n\n\ - int8_t func_trampoline(void* recv, int32_t p0, uint32_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::ISize, NativeType::U64], NativeType::Void), - "extern void func(intptr_t p0, uint64_t p1);\n\n\ - void func_trampoline(void* recv, intptr_t p0, uint64_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - assert_codegen( - codegen(vec![NativeType::USize, NativeType::USize], NativeType::U32), - "extern uint32_t func(uintptr_t p0, uintptr_t p1);\n\n\ - uint32_t func_trampoline(void* recv, uintptr_t p0, uintptr_t p1) {\ - \n return func(p0, p1);\n\ - }\n\n", - ); - } -} diff --git a/ext/ffi/lib.rs b/ext/ffi/lib.rs index ead9be2839..b93638c886 100644 --- a/ext/ffi/lib.rs +++ b/ext/ffi/lib.rs @@ -14,7 +14,6 @@ use deno_core::serde_json::json; use deno_core::serde_json::Value; use deno_core::serde_v8; use deno_core::v8; -use deno_core::v8::fast_api; use deno_core::Extension; use deno_core::OpState; use deno_core::Resource; @@ -39,15 +38,11 @@ use std::ptr; use std::rc::Rc; use std::sync::mpsc::sync_channel; -#[cfg(not(target_os = "windows"))] -mod jit_trampoline; -#[cfg(not(target_os = "windows"))] -mod tcc; +mod fast_call; #[cfg(not(target_pointer_width = "64"))] compile_error!("platform not supported"); -// Assert assumptions made in `prelude.h` const _: () = { assert!(size_of::() == 1); assert!(size_of::() == 2); @@ -90,8 +85,6 @@ struct Symbol { ptr: libffi::middle::CodePtr, parameter_types: Vec, result_type: NativeType, - // This is dead code only on Windows - #[allow(dead_code)] can_callback: bool, } @@ -729,50 +722,6 @@ where )) } -pub struct FfiFastCallTemplate { - args: Box<[fast_api::Type]>, - ret: fast_api::CType, - symbol_ptr: *const c_void, -} - -impl fast_api::FastFunction for FfiFastCallTemplate { - fn function(&self) -> *const c_void { - self.symbol_ptr - } - - fn args(&self) -> &'static [fast_api::Type] { - Box::leak(self.args.clone()) - } - - fn return_type(&self) -> fast_api::CType { - self.ret - } -} - -impl From<&NativeType> for fast_api::Type { - fn from(native_type: &NativeType) -> Self { - match native_type { - NativeType::Bool => fast_api::Type::Bool, - NativeType::U8 | NativeType::U16 | NativeType::U32 => { - fast_api::Type::Uint32 - } - NativeType::I8 | NativeType::I16 | NativeType::I32 => { - fast_api::Type::Int32 - } - NativeType::F32 => fast_api::Type::Float32, - NativeType::F64 => fast_api::Type::Float64, - NativeType::Void => fast_api::Type::Void, - NativeType::I64 => fast_api::Type::Int64, - NativeType::U64 => fast_api::Type::Uint64, - NativeType::ISize => fast_api::Type::Int64, - NativeType::USize | NativeType::Pointer | NativeType::Function => { - fast_api::Type::Uint64 - } - NativeType::Buffer => fast_api::Type::TypedArray(fast_api::CType::Uint8), - } - } -} - fn needs_unwrap(rv: NativeType) -> bool { matches!( rv, @@ -796,42 +745,6 @@ fn make_sync_fn<'s>( scope: &mut v8::HandleScope<'s>, sym: Box, ) -> v8::Local<'s, v8::Function> { - #[cfg(not(target_os = "windows"))] - let mut fast_ffi_templ: Option = None; - - #[cfg(target_os = "windows")] - let fast_ffi_templ: Option = None; - - #[cfg(not(target_os = "windows"))] - let mut fast_allocations: Option<*mut ()> = None; - #[cfg(not(target_os = "windows"))] - if !sym.can_callback { - let needs_unwrap = needs_unwrap(sym.result_type); - let ret = match 
needs_unwrap { - true => fast_api::Type::Void, - false => fast_api::Type::from(&sym.result_type), - }; - - let mut args = sym - .parameter_types - .iter() - .map(|t| t.into()) - .collect::>(); - // recv - args.insert(0, fast_api::Type::V8Value); - if needs_unwrap { - args.push(fast_api::Type::TypedArray(fast_api::CType::Int32)); - } - let symbol_trampoline = - jit_trampoline::gen_trampoline(sym.clone()).expect("gen_trampoline"); - fast_ffi_templ = Some(FfiFastCallTemplate { - args: args.into_boxed_slice(), - ret: (&ret).into(), - symbol_ptr: symbol_trampoline.addr, - }); - fast_allocations = Some(Box::into_raw(symbol_trampoline) as *mut ()); - } - let sym = Box::leak(sym); let builder = v8::FunctionTemplate::builder( |scope: &mut v8::HandleScope, @@ -891,8 +804,17 @@ fn make_sync_fn<'s>( ) .data(v8::External::new(scope, sym as *mut Symbol as *mut _).into()); - let func = if let Some(fast_ffi_templ) = fast_ffi_templ { - builder.build_fast(scope, &fast_ffi_templ, None) + let mut fast_call_alloc = None; + + let func = if fast_call::is_compatible(sym) { + let trampoline = fast_call::compile_trampoline(sym); + let func = builder.build_fast( + scope, + &fast_call::make_template(sym, &trampoline), + None, + ); + fast_call_alloc = Some(Box::into_raw(Box::new(trampoline))); + func } else { builder.build(scope) }; @@ -904,12 +826,12 @@ fn make_sync_fn<'s>( Box::new(move |_| { // SAFETY: This is never called twice. pointer obtained // from Box::into_raw, hence, satisfies memory layout requirements. - unsafe { - Box::from_raw(sym); - #[cfg(not(target_os = "windows"))] - if let Some(fast_allocations) = fast_allocations { - Box::from_raw(fast_allocations as *mut jit_trampoline::Allocation); - } + let _ = unsafe { Box::from_raw(sym) }; + if let Some(fast_call_ptr) = fast_call_alloc { + // fast-call compiled trampoline is unmapped when the MMAP handle is dropped + // SAFETY: This is never called twice. pointer obtained + // from Box::into_raw, hence, satisfies memory layout requirements. + let _ = unsafe { Box::from_raw(fast_call_ptr) }; } }), ); diff --git a/ext/ffi/prelude.h b/ext/ffi/prelude.h deleted file mode 100644 index 2da1e65238..0000000000 --- a/ext/ffi/prelude.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. - -/* Boolean type */ - -#ifndef _STDBOOL_H -#define _STDBOOL_H - -#define bool _Bool -#define true 1 -#define false 0 - -#endif - -/* Exact integral types. */ - -/* Signed. */ -typedef signed char int8_t; -typedef short int int16_t; -typedef int int32_t; -typedef long int int64_t; - -/* Unsigned. */ -typedef unsigned char uint8_t; -typedef unsigned short int uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long int uint64_t; - -/* Types for `void *' pointers. */ -typedef long int intptr_t; -typedef unsigned long int uintptr_t; - -// https://source.chromium.org/chromium/chromium/src/+/main:v8/include/v8-fast-api-calls.h;l=336 -struct FastApiTypedArray { - uintptr_t length_; - void* data; -}; diff --git a/ext/ffi/tcc.rs b/ext/ffi/tcc.rs deleted file mode 100644 index de7c719602..0000000000 --- a/ext/ffi/tcc.rs +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2018-2022 the Deno authors. All rights reserved. MIT license. 
- -use std::{ - ffi::CStr, - marker::PhantomData, - os::raw::{c_char, c_int, c_void}, - ptr::null_mut, -}; - -#[repr(C)] -#[derive(Debug)] -pub struct TCCState { - _unused: [u8; 0], -} -pub const TCC_OUTPUT_MEMORY: i32 = 1; - -extern "C" { - pub fn tcc_new() -> *mut TCCState; - pub fn tcc_delete(s: *mut TCCState); - pub fn tcc_set_options(s: *mut TCCState, str: *const c_char); - pub fn tcc_compile_string(s: *mut TCCState, buf: *const c_char) -> c_int; - pub fn tcc_add_symbol( - s: *mut TCCState, - name: *const c_char, - val: *const c_void, - ) -> c_int; - pub fn tcc_set_output_type(s: *mut TCCState, output_type: c_int) -> c_int; - pub fn tcc_relocate(s1: *mut TCCState, ptr: *mut c_void) -> c_int; - pub fn tcc_get_symbol(s: *mut TCCState, name: *const c_char) -> *mut c_void; -} - -/// Compilation context. -pub struct Compiler { - inner: *mut TCCState, - _phantom: PhantomData, - pub bin: Option>, -} - -impl Compiler { - pub fn new() -> Result { - // SAFETY: There is one context per thread. - let inner = unsafe { tcc_new() }; - if inner.is_null() { - Err(()) - } else { - let ret = - // SAFETY: set output to memory. - unsafe { tcc_set_output_type(inner, TCC_OUTPUT_MEMORY as c_int) }; - assert_eq!(ret, 0); - Ok(Self { - inner, - _phantom: PhantomData, - bin: None, - }) - } - } - - pub fn set_options(&mut self, option: &CStr) -> &mut Self { - // SAFETY: option is a null-terminated C string. - unsafe { - tcc_set_options(self.inner, option.as_ptr()); - } - self - } - - pub fn compile_string(&mut self, p: &CStr) -> Result<(), ()> { - // SAFETY: p is a null-terminated C string. - let ret = unsafe { tcc_compile_string(self.inner, p.as_ptr()) }; - if ret == 0 { - Ok(()) - } else { - Err(()) - } - } - - /// # Safety - /// Symbol need satisfy ABI requirement. - pub unsafe fn add_symbol(&mut self, sym: &CStr, val: *const c_void) { - // SAFETY: sym is a null-terminated C string. - let ret = tcc_add_symbol(self.inner, sym.as_ptr(), val); - assert_eq!(ret, 0); - } - - pub fn relocate_and_get_symbol( - &mut self, - sym: &CStr, - ) -> Result<*mut c_void, ()> { - // SAFETY: pass null ptr to get required length - let len = unsafe { tcc_relocate(self.inner, null_mut()) }; - if len == -1 { - return Err(()); - }; - let mut bin = Vec::with_capacity(len as usize); - let ret = - // SAFETY: bin is allocated up to len. - unsafe { tcc_relocate(self.inner, bin.as_mut_ptr() as *mut c_void) }; - if ret != 0 { - return Err(()); - } - // SAFETY: if ret == 0, bin is initialized. - unsafe { - bin.set_len(len as usize); - } - self.bin = Some(bin); - // SAFETY: sym is a null-terminated C string. 
- let addr = unsafe { tcc_get_symbol(self.inner, sym.as_ptr()) }; - Ok(addr) - } -} - -impl Drop for Compiler { - fn drop(&mut self) { - // SAFETY: delete state from tcc_new() - unsafe { tcc_delete(self.inner) }; - } -} diff --git a/ext/ffi/tinycc b/ext/ffi/tinycc deleted file mode 160000 index afc136262e..0000000000 --- a/ext/ffi/tinycc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit afc136262e93ae85fb3643005b36dbfc30d99c42 diff --git a/test_ffi/Cargo.toml b/test_ffi/Cargo.toml index cc7708fbc2..38dd86b3ea 100644 --- a/test_ffi/Cargo.toml +++ b/test_ffi/Cargo.toml @@ -11,4 +11,5 @@ publish = false crate-type = ["cdylib"] [dev-dependencies] +pretty_assertions = "1.2.1" test_util = { path = "../test_util" } diff --git a/test_ffi/src/lib.rs b/test_ffi/src/lib.rs index 812b563ef7..be0d2e42f8 100644 --- a/test_ffi/src/lib.rs +++ b/test_ffi/src/lib.rs @@ -258,6 +258,60 @@ pub extern "C" fn call_stored_function_thread_safe_and_log() { }); } +#[no_mangle] +pub extern "C" fn log_many_parameters( + a: u8, + b: u16, + c: u32, + d: u64, + e: f64, + f: f32, + g: i64, + h: i32, + i: i16, + j: i8, + k: isize, + l: usize, + m: f64, + n: f32, + o: f64, + p: f32, + q: f64, + r: f32, + s: f64, +) { + println!("{a} {b} {c} {d} {e} {f} {g} {h} {i} {j} {k} {l} {m} {n} {o} {p} {q} {r} {s}"); +} + +#[no_mangle] +pub extern "C" fn cast_u8_u32(x: u8) -> u32 { + x as u32 +} + +#[no_mangle] +pub extern "C" fn cast_u32_u8(x: u32) -> u8 { + x as u8 +} + +#[no_mangle] +pub extern "C" fn add_many_u16( + a: u16, + b: u16, + c: u16, + d: u16, + e: u16, + f: u16, + g: u16, + h: u16, + i: u16, + j: u16, + k: u16, + l: u16, + m: u16, +) -> u16 { + a + b + c + d + e + f + g + h + i + j + k + l + m +} + // FFI performance helper functions #[no_mangle] pub extern "C" fn nop() {} diff --git a/test_ffi/tests/integration_tests.rs b/test_ffi/tests/integration_tests.rs index 88da8a0b99..6b48534537 100644 --- a/test_ffi/tests/integration_tests.rs +++ b/test_ffi/tests/integration_tests.rs @@ -80,6 +80,10 @@ fn basic() { 579.912\n\ true\n\ false\n\ + 579.9119873046875\n\ + 579.9119873046875\n\ + 579.912\n\ + 579.912\n\ 579\n\ 8589934590\n\ -8589934590\n\ @@ -105,6 +109,14 @@ fn basic() { buf: [1, 2, 3, 4, 5, 6, 7, 8]\n\ logCallback\n\ 30\n\ + 255 65535 4294967295 4294967296 123.456 789.876 -1 -2 -3 -4 -1000 1000 12345.67891 12345.679 12345.67891 12345.679 12345.67891 12345.679 12345.67891\n\ + 255 65535 4294967295 4294967296 123.456 789.876 -1 -2 -3 -4 -1000 1000 12345.67891 12345.679 12345.67891 12345.679 12345.67891 12345.679 12345.67891\n\ + 0\n\ + 0\n\ + 0\n\ + 0\n\ + 78\n\ + 78\n\ STORED_FUNCTION cleared\n\ STORED_FUNCTION_2 cleared\n\ Thread safe call counter: 0\n\ @@ -120,6 +132,8 @@ fn basic() { uint32Array[0]: 42\n\ uint32Array[0] after mutation: 55\n\ Static ptr value after mutation: 55\n\ + 2264956937\n\ + 2264956937\n\ Correct number of resources\n"; assert_eq!(stdout, expected); assert_eq!(stderr, ""); diff --git a/test_ffi/tests/test.js b/test_ffi/tests/test.js index f71f23adc8..6bf3c47f82 100644 --- a/test_ffi/tests/test.js +++ b/test_ffi/tests/test.js @@ -6,6 +6,7 @@ import { assertEquals } from "https://deno.land/std@0.149.0/testing/asserts.ts"; import { assertThrows, + assert, } from "../../test_util/std/testing/asserts.ts"; const targetDir = Deno.execPath().replace(/[^\/\\]+$/, ""); @@ -175,6 +176,22 @@ const dylib = Deno.dlopen(libPath, { result: "void", callback: true, }, + log_many_parameters: { + parameters: ["u8", "u16", "u32", "u64", "f64", "f32", "i64", "i32", "i16", "i8", "isize", "usize", "f64", "f32", "f64", 
"f32", "f64", "f32", "f64"], + result: "void", + }, + cast_u8_u32: { + parameters: ["u8"], + result: "u32", + }, + cast_u32_u8: { + parameters: ["u32"], + result: "u8", + }, + add_many_u16: { + parameters: ["u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16", "u16"], + result: "u16", + }, // Statics "static_u32": { type: "u32", @@ -191,6 +208,7 @@ const dylib = Deno.dlopen(libPath, { "static_char": { type: "pointer", }, + "hash": { parameters: ["buffer", "u32"], result: "u32" }, }); const { symbols } = dylib; @@ -210,11 +228,7 @@ function returnBuffer() { return return_buffer(); }; returnBuffer(); %OptimizeFunctionOnNextCall(returnBuffer); const ptr0 = returnBuffer(); - -const status = %GetOptimizationStatus(returnBuffer); -if (!(status & (1 << 4))) { - throw new Error("returnBuffer is not optimized"); -} +assertIsOptimized(returnBuffer); dylib.symbols.print_pointer(ptr0, 8); const ptrView = new Deno.UnsafePointerView(ptr0); @@ -266,17 +280,10 @@ const { add_u32, add_usize_fast } = symbols; function addU32Fast(a, b) { return add_u32(a, b); }; - -%PrepareFunctionForOptimization(addU32Fast); -console.log(addU32Fast(123, 456)); -%OptimizeFunctionOnNextCall(addU32Fast); -console.log(addU32Fast(123, 456)); +testOptimized(addU32Fast, () => addU32Fast(123, 456)); function addU64Fast(a, b) { return add_usize_fast(a, b); }; -%PrepareFunctionForOptimization(addU64Fast); -console.log(addU64Fast(2, 3)); -%OptimizeFunctionOnNextCall(addU64Fast); -console.log(addU64Fast(2, 3)); +testOptimized(addU64Fast, () => addU64Fast(2, 3)); console.log(dylib.symbols.add_i32(123, 456)); console.log(dylib.symbols.add_u64(0xffffffffn, 0xffffffffn)); @@ -294,6 +301,16 @@ console.log(dylib.symbols.add_f64(123.123, 456.789)); console.log(dylib.symbols.and(true, true)); console.log(dylib.symbols.and(true, false)); +function addF32Fast(a, b) { + return dylib.symbols.add_f32(a, b); +}; +testOptimized(addF32Fast, () => addF32Fast(123.123, 456.789)); + +function addF64Fast(a, b) { + return dylib.symbols.add_f64(a, b); +}; +testOptimized(addF64Fast, () => addF64Fast(123.123, 456.789)); + // Test adders as nonblocking calls console.log(await dylib.symbols.add_i32_nonblocking(123, 456)); console.log(await dylib.symbols.add_u64_nonblocking(0xffffffffn, 0xffffffffn)); @@ -437,6 +454,39 @@ call_stored_function(); dylib.symbols.store_function_2(add10Callback.pointer); dylib.symbols.call_stored_function_2(20); +function logManyParametersFast(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) { + return symbols.log_many_parameters(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); +}; +testOptimized( + logManyParametersFast, + () => logManyParametersFast( + 255, 65535, 4294967295, 4294967296, 123.456, 789.876, -1, -2, -3, -4, -1000, 1000, + 12345.678910, 12345.678910, 12345.678910, 12345.678910, 12345.678910, 12345.678910, 12345.678910 + ) +); + +// Some ABIs rely on the convention to zero/sign-extend arguments by the caller to optimize the callee function. +// If the trampoline did not zero/sign-extend arguments, this would return 256 instead of the expected 0 (in optimized builds) +function castU8U32Fast(x) { return symbols.cast_u8_u32(x); }; +testOptimized(castU8U32Fast, () => castU8U32Fast(256)); + +// Some ABIs rely on the convention to expect garbage in the bits beyond the size of the return value to optimize the callee function. 
+// If the trampoline did not zero/sign-extend the return value, this would return 256 instead of the expected 0 (in optimized builds) +function castU32U8Fast(x) { return symbols.cast_u32_u8(x); }; +testOptimized(castU32U8Fast, () => castU32U8Fast(256)); + +// Generally the trampoline tail-calls into the FFI function, but in certain cases (e.g. when returning 8 or 16 bit integers) +// the tail call is not possible and a new stack frame must be created. We need enough parameters to have some on the stack +function addManyU16Fast(a, b, c, d, e, f, g, h, i, j, k, l, m) { + return symbols.add_many_u16(a, b, c, d, e, f, g, h, i, j, k, l, m); +}; +// N.B. V8 does not currently follow Aarch64 Apple's calling convention. +// The current implementation of the JIT trampoline follows the V8 incorrect calling convention. This test covers the use-case +// and is expected to fail once Deno uses a V8 version with the bug fixed. +// The V8 bug is being tracked in https://bugs.chromium.org/p/v8/issues/detail?id=13171 +testOptimized(addManyU16Fast, () => addManyU16Fast(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)); + + const nestedCallback = new Deno.UnsafeCallback( { parameters: [], result: "void" }, () => { @@ -502,6 +552,12 @@ try { console.log("Invalid UTF-8 characters to `v8::String`:", charView.getCString()); } + +const bytes = new Uint8Array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); +function hash() { return dylib.symbols.hash(bytes, bytes.byteLength); }; + +testOptimized(hash, () => hash()); + (function cleanup() { dylib.close(); throwCallback.close(); @@ -526,4 +582,23 @@ After: ${postStr}`, } console.log("Correct number of resources"); -})(); \ No newline at end of file +})(); + +function assertIsOptimized(fn) { + const status = % GetOptimizationStatus(fn); + assert(status & (1 << 4), `expected ${fn.name} to be optimized, but wasn't`); +} + +function testOptimized(fn, callback) { + %PrepareFunctionForOptimization(fn); + const r1 = callback(); + if (r1 !== undefined) { + console.log(r1); + } + %OptimizeFunctionOnNextCall(fn); + const r2 = callback(); + if (r2 !== undefined) { + console.log(r2); + } + assertIsOptimized(fn); +} \ No newline at end of file diff --git a/test_util/Cargo.toml b/test_util/Cargo.toml index 7fa309dac4..ce5ad244d4 100644 --- a/test_util/Cargo.toml +++ b/test_util/Cargo.toml @@ -40,4 +40,4 @@ tokio-tungstenite = "0.16" pty = "0.2.2" [target.'cfg(windows)'.dependencies] -winapi = { version = "0.3.9", features = ["consoleapi", "handleapi", "namedpipeapi", "winbase", "winerror"] } +winapi = { version = "0.3.9", features = ["consoleapi", "synchapi", "handleapi", "namedpipeapi", "winbase", "winerror"] }
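A note on the Win64 stack sizing in `allocate_stack` above: the callee always owns the 32-byte shadow space, and RSP must be 16-byte aligned at the call site given the 8-byte return address already pushed by the trampoline's caller (V8). The standalone sketch below (illustrative names, not part of this diff; only `padding_to_align` mirrors the helper in fast_call.rs) reproduces that computation, and the results line up with the `sub rsp` immediates asserted in the x64_windows tests.

// Sketch of the Win64 stack-size rule used by the trampoline (assumed helper
// names; only `padding_to_align` matches the code in fast_call.rs).
fn padding_to_align(alignment: u32, size: u32) -> u32 {
  (alignment - size % alignment) % alignment
}

// `params` is the number of callee parameters; `frame_pointer` is whatever the
// trampoline has already pushed onto its frame (e.g. 8 after `push rbx`).
fn win64_stack_allocation(params: u32, frame_pointer: u32) -> u32 {
  // The callee owns at least 4 stack slots of 8 bytes (the 32-byte shadow space).
  let mut stack_size = params.max(4) * 8;
  // Keep RSP 16-byte aligned at the call, accounting for the 8-byte return
  // address pushed by the trampoline's caller.
  stack_size += padding_to_align(16, frame_pointer + stack_size + 8);
  stack_size
}

fn main() {
  // Matches `sub rsp, DWORD 104` in the x64_windows integer_casting test
  // (12 parameters, nothing pushed beforehand).
  assert_eq!(win64_stack_allocation(12, 0), 104);
  // Matches `sub rsp, DWORD 48` in return_u64_in_stack_typed_array
  // (5 parameters, after `push rbx`).
  assert_eq!(win64_stack_allocation(5, 8), 48);
}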