Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 84 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,22 @@ static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0);
/// the arena, so library state that outlives a phase doesn't land in
/// recycled memory.
///
/// Defaults to 4096 (one page) — covers the known phase-crossing patterns:
/// crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber Registry
/// slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core job
/// stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override
/// Defaults to 4096 (one page) on x86_64 — covers the known phase-crossing
/// patterns: crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber
/// Registry slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core
/// job stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override
/// to a different threshold.
///
/// On aarch64 the default drops to 256: with the iter-8 32 MiB-THP-backed
/// arena, allocs in the arena hit a hugepage TLB entry while System allocs
/// land on 16 KiB base pages, so routing the 256..4095 size band into the
/// arena buys the hugepage TLB benefit for those allocations (-1.30% of
/// cycles in glibc helpers observed on the original zk-alloc M2 profile).
/// The rayon-flush feature (default-on) keeps Injector blocks safe across
/// phase boundaries; sticky-System realloc protects grown Vecs.
// Compile-time default threshold, chosen per target architecture:
// 256 bytes on aarch64, 4096 bytes on every other target.
#[cfg(target_arch = "aarch64")]
const DEFAULT_MIN_ARENA_BYTES: usize = 256;
#[cfg(not(target_arch = "aarch64"))]
const DEFAULT_MIN_ARENA_BYTES: usize = 4096;
// Runtime copy of the threshold, seeded with the per-architecture default.
// NOTE(review): presumably overwritten from ZK_ALLOC_MIN_BYTES elsewhere in
// the file — confirm against the initialisation code.
static MIN_ARENA_BYTES: AtomicUsize = AtomicUsize::new(DEFAULT_MIN_ARENA_BYTES);

Expand Down Expand Up @@ -159,16 +170,81 @@ fn ensure_region() -> usize {
let max_threads = cpus + SLACK;
let region_size = slab_size * max_threads;

// On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. Over-allocate by
// THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary — required
// for khugepaged to collapse base pages into hugepages. With alignment
// + an eager touch (one write per 32 MiB) the kernel collapses the touched
// region into THP synchronously instead of relying on async khugepaged.
#[cfg(target_arch = "aarch64")]
const THP_SIZE: usize = 32 << 20;

#[cfg(target_arch = "aarch64")]
let mmap_size = region_size + THP_SIZE;
#[cfg(not(target_arch = "aarch64"))]
let mmap_size = region_size;
// SAFETY: mmap_anonymous returns a page-aligned pointer or null.
// MAP_NORESERVE means no physical memory is committed until pages are touched.
let ptr = unsafe { syscall::mmap_anonymous(region_size) };
if ptr.is_null() {
let raw = unsafe { syscall::mmap_anonymous(mmap_size) };
if raw.is_null() {
std::process::abort();
}
unsafe { syscall::madvise(ptr, region_size, syscall::MADV_NOHUGEPAGE) };

#[cfg(target_arch = "aarch64")]
let aligned_base = (raw as usize).next_multiple_of(THP_SIZE);
#[cfg(not(target_arch = "aarch64"))]
let aligned_base = raw as usize;

// On aarch64, ask khugepaged to use THP for the slab region. On x86_64
// preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab
// release; documented original choice).
#[cfg(target_arch = "aarch64")]
let advice = syscall::MADV_HUGEPAGE;
#[cfg(not(target_arch = "aarch64"))]
let advice = syscall::MADV_NOHUGEPAGE;
unsafe { syscall::madvise(aligned_base as *mut u8, region_size, advice) };

// Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across
// the first `pretouch_bytes` of every per-thread slab. Each write triggers
// a page fault that the kernel resolves into a 32 MiB THP given our
// MADV_HUGEPAGE hint and the 32 MiB-aligned base. Makes the THP win
// deterministic instead of khugepaged-async-dependent.
//
// Adapt `pretouch_bytes` to MemTotal so total pre-touch stays under
// MemTotal / OVERCOMMIT_GUARD (= 1/3 of RAM): on a 16 GiB Asahi M2 box,
// a hard-coded 1 GiB × 14 slabs = 14 GiB pre-touch over-commits and gets
// OOM-killed. Formula gives ~390 MiB per slab at 16 GiB, ~1 GiB at 64 GiB.
// Floor at THP_SIZE so we still pre-touch at least one hugepage if
// `total_ram_bytes()` returns 0 (stub or syscall failure).
#[cfg(target_arch = "aarch64")]
{
const PRETOUCH_HARD_CAP: usize = 1 << 30;
const OVERCOMMIT_GUARD: usize = 3;
// SAFETY: total_ram_bytes is allocation-free on platforms with a real
// impl, and the libc-fallback stub returns 0 without allocating.
let mem_total = unsafe { syscall::total_ram_bytes() };
let pretouch_bytes = if mem_total == 0 {
THP_SIZE
} else {
let budget = mem_total / max_threads / OVERCOMMIT_GUARD;
budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP)
};
for slab_idx in 0..max_threads {
let slab_base = aligned_base + slab_idx * slab_size;
let mut off = 0;
while off < pretouch_bytes {
// SAFETY: aligned_base..aligned_base+region_size is a valid
// anonymous mmap reservation; we only touch within slab.
unsafe {
std::ptr::write_volatile((slab_base + off) as *mut u8, 0);
}
off += THP_SIZE;
}
}
}

MAX_THREADS.store(max_threads, Ordering::Release);
REGION_SIZE.store(region_size, Ordering::Release);
REGION_BASE.store(ptr as usize, Ordering::Release);
REGION_BASE.store(aligned_base, Ordering::Release);
});
REGION_BASE.load(Ordering::Acquire)
}
Expand Down
13 changes: 13 additions & 0 deletions src/syscall.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ mod imp {
mod imp {
use std::ptr;

pub const MADV_HUGEPAGE: usize = 14;
pub const MADV_NOHUGEPAGE: usize = 15;

#[inline]
Expand All @@ -117,6 +118,18 @@ mod imp {
/// No-op stand-in for `madvise` in this `imp` block.
///
/// All three arguments are ignored and nothing is returned, so callers can
/// issue the same advice calls unconditionally.
///
/// # Safety
///
/// This stub dereferences nothing and performs no system call.
/// NOTE(review): presumably `unsafe` only to match the signature of the
/// real implementation in the other `imp` block — confirm.
pub unsafe fn madvise(_ptr: *mut u8, _size: usize, _advice: usize) {
// The advice values we pass are Linux-specific.
}

/// Fallback RAM probe: always reports the total memory size as "unknown".
///
/// The value 0 is a sentinel rather than a measurement. The adaptive
/// pre-touch in lib.rs treats it as "size not available" and touches only a
/// single hugepage per slab, which cannot over-commit (no OOM) but gives up
/// the full THP-coverage win of iter 8. Allocation-free implementations
/// that query the OS (sysinfo syscall on aarch64-Linux, sysctl on macOS)
/// belong in their own raw-syscall imp blocks.
///
/// # Safety
///
/// This stub touches no memory and performs no system call.
/// NOTE(review): presumably `unsafe` only to mirror the signature of the
/// platform-specific implementations — confirm.
#[inline]
pub unsafe fn total_ram_bytes() -> usize {
    // Sentinel understood by the caller as "RAM size not available".
    const UNKNOWN_RAM_BYTES: usize = 0;
    UNKNOWN_RAM_BYTES
}
}

pub use imp::{madvise, mmap_anonymous, MADV_NOHUGEPAGE};
#[cfg(target_arch = "aarch64")]
pub use imp::{total_ram_bytes, MADV_HUGEPAGE};