From 44aa0ac1c4dac5d5510c82ed4f30d7efc231847e Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Mon, 11 May 2026 16:28:49 +0200 Subject: [PATCH 1/2] perf(aarch64): 32 MiB-aligned mmap + MADV_HUGEPAGE + adaptive pre-touch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports the iter-8 + iter-19 wins from the leanMultisig M2 Asahi experiment (commits 22fe0f88 and b342fa36) into upstream zk-alloc. iter 8 over-allocates the slab region by 32 MiB, rounds REGION_BASE up to a 32 MiB hugepage boundary, sets MADV_HUGEPAGE, and writes one byte per THP page across the pre-touched prefix of each slab during REGION_INIT.call_once. With the alignment + hint, each touch fault is satisfied with a 32 MiB THP synchronously, making the THP win deterministic instead of khugepaged-async-dependent. iter 7 saw the same signal but with p=0.019; iter 8 stabilises it. On M2 Asahi the net effect is roughly -2.5% on warm prove time. iter 19 makes the pre-touch budget runtime-adaptive: pretouch_bytes = (MemTotal / max_threads / 3).clamp(THP_SIZE, 1 GiB) A hard-coded 1 GiB × 14 slabs = 14 GiB pre-touch overshoots the 16 GiB M-series Macs we target (eval-gate prove_loop_cand was OOM-killed twice on the Asahi M2 box on 2026-05-11 with anon-rss ~14.3 GiB). The adaptive formula caps total pre-touch at MemTotal/3, leaving headroom for the workload's own ~10 GiB touched footprint and the rest of the process. On a 64 GiB Hetzner box the formula tops out at the 1 GiB ceiling, preserving iter 8's exact behaviour there. MemTotal is sourced via an allocation-free fallback: `syscall::total_ram_bytes()` returns 0 from the libc fallback arm (current aarch64-Linux path in this base; the real sysinfo-syscall implementation will live in #11's raw-syscall arm after merge). When it returns 0 the formula falls back to THP_SIZE per slab — conservative but safe (no OOM, but loses most of iter 8's THP-coverage benefit until #11 + this rollup are both on main). 
All changes are cfg-gated to target_arch="aarch64"; x86_64 keeps the existing MADV_NOHUGEPAGE hint and the unmodified region layout. Local cargo fmt / clippy / test --workspace pass on x86_64 Hetzner Zen 4. Pairing rule: iter 8 must not ship without iter 19. iter 8 alone OOMs 16 GiB Macs (Justin's deployment target). This commit ships both. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lib.rs | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- src/syscall.rs | 13 +++++++++ 2 files changed, 82 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2804e8a..84a56c2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -159,16 +159,81 @@ fn ensure_region() -> usize { let max_threads = cpus + SLACK; let region_size = slab_size * max_threads; + // On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. Over-allocate by + // THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary — required + // for khugepaged to collapse base pages into hugepages. With alignment + // + an eager touch (one write per 32 MiB) the kernel collapses the touched + // region into THP synchronously instead of relying on async khugepaged. + #[cfg(target_arch = "aarch64")] + const THP_SIZE: usize = 32 << 20; + + #[cfg(target_arch = "aarch64")] + let mmap_size = region_size + THP_SIZE; + #[cfg(not(target_arch = "aarch64"))] + let mmap_size = region_size; // SAFETY: mmap_anonymous returns a page-aligned pointer or null. // MAP_NORESERVE means no physical memory is committed until pages are touched. 
- let ptr = unsafe { syscall::mmap_anonymous(region_size) }; - if ptr.is_null() { + let raw = unsafe { syscall::mmap_anonymous(mmap_size) }; + if raw.is_null() { std::process::abort(); } - unsafe { syscall::madvise(ptr, region_size, syscall::MADV_NOHUGEPAGE) }; + + #[cfg(target_arch = "aarch64")] + let aligned_base = (raw as usize).next_multiple_of(THP_SIZE); + #[cfg(not(target_arch = "aarch64"))] + let aligned_base = raw as usize; + + // On aarch64, ask khugepaged to use THP for the slab region. On x86_64 + // preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab + // release; documented original choice). + #[cfg(target_arch = "aarch64")] + let advice = syscall::MADV_HUGEPAGE; + #[cfg(not(target_arch = "aarch64"))] + let advice = syscall::MADV_NOHUGEPAGE; + unsafe { syscall::madvise(aligned_base as *mut u8, region_size, advice) }; + + // Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across + // the first `pretouch_bytes` of every per-thread slab. Each write triggers + // a page fault that the kernel resolves into a 32 MiB THP given our + // MADV_HUGEPAGE hint and the 32 MiB-aligned base. Makes the THP win + // deterministic instead of khugepaged-async-dependent. + // + // Adapt `pretouch_bytes` to MemTotal so total pre-touch stays under + // MemTotal / OVERCOMMIT_GUARD (= 1/3 of RAM): on a 16 GiB Asahi M2 box, + // a hard-coded 1 GiB × 14 slabs = 14 GiB pre-touch over-commits and gets + // OOM-killed. Formula gives ~390 MiB per slab at 16 GiB, ~1 GiB at 64 GiB. + // Floor at THP_SIZE so we still pre-touch at least one hugepage if + // `total_ram_bytes()` returns 0 (stub or syscall failure). + #[cfg(target_arch = "aarch64")] + { + const PRETOUCH_HARD_CAP: usize = 1 << 30; + const OVERCOMMIT_GUARD: usize = 3; + // SAFETY: total_ram_bytes is allocation-free on platforms with a real + // impl, and the libc-fallback stub returns 0 without allocating. 
+ let mem_total = unsafe { syscall::total_ram_bytes() }; + let pretouch_bytes = if mem_total == 0 { + THP_SIZE + } else { + let budget = mem_total / max_threads / OVERCOMMIT_GUARD; + budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP) + }; + for slab_idx in 0..max_threads { + let slab_base = aligned_base + slab_idx * slab_size; + let mut off = 0; + while off < pretouch_bytes { + // SAFETY: aligned_base..aligned_base+region_size is a valid + // anonymous mmap reservation; we only touch within slab. + unsafe { + std::ptr::write_volatile((slab_base + off) as *mut u8, 0); + } + off += THP_SIZE; + } + } + } + MAX_THREADS.store(max_threads, Ordering::Release); REGION_SIZE.store(region_size, Ordering::Release); - REGION_BASE.store(ptr as usize, Ordering::Release); + REGION_BASE.store(aligned_base, Ordering::Release); }); REGION_BASE.load(Ordering::Acquire) } diff --git a/src/syscall.rs b/src/syscall.rs index f676b2a..a873517 100644 --- a/src/syscall.rs +++ b/src/syscall.rs @@ -96,6 +96,7 @@ mod imp { mod imp { use std::ptr; + pub const MADV_HUGEPAGE: usize = 14; pub const MADV_NOHUGEPAGE: usize = 15; #[inline] @@ -117,6 +118,18 @@ mod imp { pub unsafe fn madvise(_ptr: *mut u8, _size: usize, _advice: usize) { // The advice values we pass are Linux-specific. } + + /// Conservative stub: returns 0 to signal "unknown". Real allocation-free + /// implementations (sysinfo syscall on aarch64-Linux, sysctl on macOS) live + /// behind their own raw-syscall imp blocks. With 0, the lib.rs adaptive + /// pre-touch falls back to a single hugepage per slab — safe (no OOM) + /// but loses the full THP-coverage win of iter 8. 
+ #[inline] + pub unsafe fn total_ram_bytes() -> usize { + 0 + } } pub use imp::{madvise, mmap_anonymous, MADV_NOHUGEPAGE}; +#[cfg(target_arch = "aarch64")] +pub use imp::{total_ram_bytes, MADV_HUGEPAGE}; From 39681203c600565618653aa0b46bb3f913228e16 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Mon, 11 May 2026 16:29:17 +0200 Subject: [PATCH 2/2] perf(aarch64): lower DEFAULT_MIN_ARENA_BYTES from 4096 to 256 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports leanMultisig iter-10 (commit b211697d). With iter 8's 32 MiB-THP arena landed in the previous commit, the 4096-byte size-routing threshold leaves sub-page allocs in System where they hit base-page TLB entries (16 KiB on M2 Asahi). Lowering to 256 routes the 256..4095 band into the THP-backed arena, buying the hugepage TLB benefit for that mass — the original zk-alloc profile attributed ~1.30% of cycles to glibc helpers servicing that band on M2. Phase-crossing safety: ~1.5 KB Injector blocks now land in the arena. The rayon-flush feature (default-on, src/lib.rs:225) drains the rayon injector inside end_phase() before the next begin_phase() recycles the slab, preventing the corruption case the original 4096 default guarded against. Sticky-System realloc still protects Vecs grown across phases. Kept cfg-gated to target_arch="aarch64": the iter-8 rationale (hugepage TLB) doesn't apply on x86 where the historical NOHUGEPAGE hint stands and the 4096 default is a documented phase-crossing-safety choice. Users can still override either path via ZK_ALLOC_MIN_BYTES. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lib.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 84a56c2..03897a9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,11 +117,22 @@ static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0); /// the arena, so library state that outlives a phase doesn't land in /// recycled memory. 
/// -/// Defaults to 4096 (one page) — covers the known phase-crossing patterns: -/// crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber Registry -/// slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core job -/// stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override +/// Defaults to 4096 (one page) on x86_64 — covers the known phase-crossing +/// patterns: crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber +/// Registry slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core +/// job stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override /// to a different threshold. +/// +/// On aarch64 the default drops to 256: with the iter-8 32 MiB-THP-backed +/// arena, allocs in the arena hit a hugepage TLB entry while System allocs +/// land on 16 KiB base pages, so routing the 256..4095 size band into the +/// arena buys the hugepage TLB benefit for those allocations (~1.30% of +/// cycles were spent in glibc helpers on the original zk-alloc M2 profile). +/// The rayon-flush feature (default-on) keeps Injector blocks safe across +/// phase boundaries; sticky-System realloc protects grown Vecs. +#[cfg(target_arch = "aarch64")] +const DEFAULT_MIN_ARENA_BYTES: usize = 256; +#[cfg(not(target_arch = "aarch64"))] const DEFAULT_MIN_ARENA_BYTES: usize = 4096; static MIN_ARENA_BYTES: AtomicUsize = AtomicUsize::new(DEFAULT_MIN_ARENA_BYTES);