From 44aa0ac1c4dac5d5510c82ed4f30d7efc231847e Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Mon, 11 May 2026 16:28:49 +0200
Subject: [PATCH 1/2] perf(aarch64): 32 MiB-aligned mmap + MADV_HUGEPAGE +
 adaptive pre-touch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ports the iter-8 + iter-19 wins from the leanMultisig M2 Asahi experiment
(commits 22fe0f88 and b342fa36) into upstream zk-alloc.

iter 8 over-allocates the slab region by 32 MiB, rounds REGION_BASE up to a
32 MiB hugepage boundary, sets MADV_HUGEPAGE, and writes one byte per THP
page across each slab during REGION_INIT.call_once. With the alignment +
hint, each touch fault is satisfied with a 32 MiB THP synchronously, making
the THP win deterministic instead of khugepaged-async-dependent. iter 7
saw the same signal but with p=0.019; iter 8 stabilises it. On M2 Asahi
the net win is roughly -2.5% on warm prove time.

iter 19 makes the pre-touch budget runtime-adaptive:

    pretouch_bytes = (MemTotal / max_threads / 3).clamp(THP_SIZE, 1 GiB)

A hard-coded 1 GiB × 14 slabs = 14 GiB pre-touch overshoots the 16 GiB
target M-series Macs (eval-gate prove_loop_cand was OOM-killed twice on
the Asahi M2 box on 2026-05-11 with anon-rss ~14.3 GiB). The adaptive
formula caps total pre-touch at MemTotal/3, leaving headroom for the
workload's own ~10 GiB touched footprint and the rest of the process. On a
64 GiB Hetzner box the formula tops out at the 1 GiB ceiling, preserving
iter 8's exact behaviour there.

MemTotal is sourced via an allocation-free fallback: `syscall::total_ram_bytes()`
returns 0 from the libc fallback arm (current aarch64-Linux path in this
base; the real sysinfo-syscall implementation will live in #11's raw-syscall
arm after merge). When it returns 0 the formula falls back to THP_SIZE
per slab — conservative but safe (no OOM, but loses most of iter 8's
THP-coverage benefit until #11 + this rollup are both on main).

All changes are cfg-gated to target_arch="aarch64"; x86_64 keeps the
existing MADV_NOHUGEPAGE hint and the unmodified region layout. Local
cargo fmt / clippy / test --workspace pass on x86_64 Hetzner Zen 4.

Pairing rule: iter 8 must not ship without iter 19. iter 8 alone OOMs
16 GiB Macs (Justin's deployment target). This commit ships both.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lib.rs     | 73 +++++++++++++++++++++++++++++++++++++++++++++++---
 src/syscall.rs | 13 +++++++++
 2 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 2804e8a..84a56c2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -159,16 +159,81 @@ fn ensure_region() -> usize {
         let max_threads = cpus + SLACK;
         let region_size = slab_size * max_threads;
 
+        // On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. Over-allocate by
+        // THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary — required
+        // for khugepaged to collapse base pages into hugepages. With alignment
+        // + an eager touch (one write per 32 MiB) the kernel collapses the touched
+        // region into THP synchronously instead of relying on async khugepaged.
+        #[cfg(target_arch = "aarch64")]
+        const THP_SIZE: usize = 32 << 20;
+
+        #[cfg(target_arch = "aarch64")]
+        let mmap_size = region_size + THP_SIZE;
+        #[cfg(not(target_arch = "aarch64"))]
+        let mmap_size = region_size;
         // SAFETY: mmap_anonymous returns a page-aligned pointer or null.
         // MAP_NORESERVE means no physical memory is committed until pages are touched.
-        let ptr = unsafe { syscall::mmap_anonymous(region_size) };
-        if ptr.is_null() {
+        let raw = unsafe { syscall::mmap_anonymous(mmap_size) };
+        if raw.is_null() {
             std::process::abort();
         }
-        unsafe { syscall::madvise(ptr, region_size, syscall::MADV_NOHUGEPAGE) };
+
+        #[cfg(target_arch = "aarch64")]
+        let aligned_base = (raw as usize).next_multiple_of(THP_SIZE);
+        #[cfg(not(target_arch = "aarch64"))]
+        let aligned_base = raw as usize;
+
+        // On aarch64, ask khugepaged to use THP for the slab region. On x86_64
+        // preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab
+        // release; documented original choice).
+        #[cfg(target_arch = "aarch64")]
+        let advice = syscall::MADV_HUGEPAGE;
+        #[cfg(not(target_arch = "aarch64"))]
+        let advice = syscall::MADV_NOHUGEPAGE;
+        unsafe { syscall::madvise(aligned_base as *mut u8, region_size, advice) };
+
+        // Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across
+        // the first `pretouch_bytes` of every per-thread slab. Each write triggers
+        // a page fault that the kernel resolves into a 32 MiB THP given our
+        // MADV_HUGEPAGE hint and the 32 MiB-aligned base. Makes the THP win
+        // deterministic instead of khugepaged-async-dependent.
+        //
+        // Adapt `pretouch_bytes` to MemTotal so total pre-touch stays under
+        // MemTotal / OVERCOMMIT_GUARD (= 1/3 of RAM): on a 16 GiB Asahi M2 box,
+        // a hard-coded 1 GiB × 14 slabs = 14 GiB pre-touch over-commits and gets
+        // OOM-killed. Formula gives ~390 MiB per slab at 16 GiB, ~1 GiB at 64 GiB.
+        // Floor at THP_SIZE so we still pre-touch at least one hugepage if
+        // `total_ram_bytes()` returns 0 (stub or syscall failure).
+        #[cfg(target_arch = "aarch64")]
+        {
+            const PRETOUCH_HARD_CAP: usize = 1 << 30;
+            const OVERCOMMIT_GUARD: usize = 3;
+            // SAFETY: total_ram_bytes is allocation-free on platforms with a real
+            // impl, and the libc-fallback stub returns 0 without allocating.
+            let mem_total = unsafe { syscall::total_ram_bytes() };
+            let pretouch_bytes = if mem_total == 0 {
+                THP_SIZE
+            } else {
+                let budget = mem_total / max_threads / OVERCOMMIT_GUARD;
+                budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP)
+            };
+            for slab_idx in 0..max_threads {
+                let slab_base = aligned_base + slab_idx * slab_size;
+                let mut off = 0;
+                while off < pretouch_bytes {
+                    // SAFETY: aligned_base..aligned_base+region_size is a valid anonymous
+                    // mmap reservation; in-slab only if pretouch_bytes <= slab_size (TODO confirm).
+                    unsafe {
+                        std::ptr::write_volatile((slab_base + off) as *mut u8, 0);
+                    }
+                    off += THP_SIZE;
+                }
+            }
+        }
+
         MAX_THREADS.store(max_threads, Ordering::Release);
         REGION_SIZE.store(region_size, Ordering::Release);
-        REGION_BASE.store(ptr as usize, Ordering::Release);
+        REGION_BASE.store(aligned_base, Ordering::Release);
     });
     REGION_BASE.load(Ordering::Acquire)
 }
diff --git a/src/syscall.rs b/src/syscall.rs
index f676b2a..a873517 100644
--- a/src/syscall.rs
+++ b/src/syscall.rs
@@ -96,6 +96,7 @@ mod imp {
 mod imp {
     use std::ptr;
 
+    pub const MADV_HUGEPAGE: usize = 14;
     pub const MADV_NOHUGEPAGE: usize = 15;
 
     #[inline]
@@ -117,6 +118,18 @@ mod imp {
     pub unsafe fn madvise(_ptr: *mut u8, _size: usize, _advice: usize) {
         // The advice values we pass are Linux-specific.
     }
+
+    /// Conservative stub: returns 0 to signal "unknown". Real allocation-free
+    /// implementations (sysinfo syscall on aarch64-Linux, sysctl on macOS) live
+    /// behind their own raw-syscall imp blocks. With 0, the lib.rs adaptive
+    /// pre-touch falls back to a single hugepage per slab — safe (no OOM)
+    /// but loses the full THP-coverage win of iter 8.
+    #[inline]
+    pub unsafe fn total_ram_bytes() -> usize {
+        0
+    }
 }
 
 pub use imp::{madvise, mmap_anonymous, MADV_NOHUGEPAGE};
+#[cfg(target_arch = "aarch64")]
+pub use imp::{total_ram_bytes, MADV_HUGEPAGE};

From 39681203c600565618653aa0b46bb3f913228e16 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Mon, 11 May 2026 16:29:17 +0200
Subject: [PATCH 2/2] perf(aarch64): lower DEFAULT_MIN_ARENA_BYTES from 4096 to
 256
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ports leanMultisig iter-10 (commit b211697d). With iter 8's 32 MiB-THP
arena landed in the previous commit, the 4096-byte size-routing threshold
leaves sub-page allocs in System where they hit base-page TLB entries
(16 KiB on M2 Asahi). Lowering to 256 routes the 256..4095 band into
the THP-backed arena, buying the hugepage TLB benefit for that mass —
the original zk-alloc profile attributed ~1.30% of cycles to glibc
helpers servicing that band on M2.

Phase-crossing safety: ~1.5 KB Injector blocks now land in the arena.
The rayon-flush feature (default-on, src/lib.rs:225) drains the rayon
injector inside end_phase() before the next begin_phase() recycles the
slab, preventing the corruption case the original 4096 default guarded
against. Sticky-System realloc still protects Vecs grown across phases.

Kept cfg-gated to target_arch="aarch64": the iter-8 rationale (hugepage
TLB) doesn't apply on x86 where the historical NOHUGEPAGE hint stands
and the 4096 default is a documented phase-crossing-safety choice. Users
can still override either path via ZK_ALLOC_MIN_BYTES.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lib.rs | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 84a56c2..03897a9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -117,11 +117,22 @@ static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0);
 /// the arena, so library state that outlives a phase doesn't land in
 /// recycled memory.
 ///
-/// Defaults to 4096 (one page) — covers the known phase-crossing patterns:
-/// crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber Registry
-/// slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core job
-/// stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override
+/// Defaults to 4096 (one page) on non-aarch64 — covers the known phase-crossing
+/// patterns: crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber
+/// Registry slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core
+/// job stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override
 /// to a different threshold.
+///
+/// On aarch64 the default drops to 256: with the iter-8 32 MiB-THP-backed
+/// arena, allocs in the arena hit a hugepage TLB entry while System allocs
+/// land on 16 KiB base pages, so routing the 256..4095 size band into the
+/// arena buys the hugepage TLB benefit for those allocations (~1.30% of
+/// cycles went to glibc helpers for that band on the original zk-alloc M2 profile).
+/// The rayon-flush feature (default-on) keeps Injector blocks safe across
+/// phase boundaries; sticky-System realloc protects grown Vecs.
+#[cfg(target_arch = "aarch64")]
+const DEFAULT_MIN_ARENA_BYTES: usize = 256;
+#[cfg(not(target_arch = "aarch64"))]
 const DEFAULT_MIN_ARENA_BYTES: usize = 4096;
 static MIN_ARENA_BYTES: AtomicUsize = AtomicUsize::new(DEFAULT_MIN_ARENA_BYTES);