From 199f91091da112b7fdaddba25a135d232abf4f6a Mon Sep 17 00:00:00 2001 From: Sharif Date: Sat, 6 Jun 2026 11:03:56 -0600 Subject: [PATCH] jit/a64: avoid redundant icache flush on macOS after generateProgram On macOS with MAP_JIT (macOS 11+, all Apple Silicon), setPagesRX toggles write-protect via pthread_jit_write_protect_np(1) and then flushes the entire JIT code buffer with __builtin___clear_cache. JitCompilerA64::enableExecution is called once per hash iteration, immediately after generateProgram (or generateProgramLight / generateSuperscalarHash), each of which already calls __builtin___clear_cache over the exact modified range. The subsequent full-buffer flush in setPagesRX is therefore redundant. On Apple Silicon, sys_icache_invalidate (the routine __builtin___clear_cache resolves to on Darwin) serialises the instruction pipeline. Profiling on M4 Pro shows it consuming ~3% of CPU samples, rooted specifically in the setPagesRX call inside enableExecution. Fix: add setPagesRXKeepIcache, which on the macOS 11+ code path only calls pthread_jit_write_protect_np(1) without __builtin___clear_cache. On every other path (other platforms, or macOS builds/runtimes without the macOS 11+ JIT write-protect API) the function falls through to the existing pageProtect path, so behaviour is unchanged there. JitCompilerA64::enableExecution switches to setPagesRXKeepIcache. No other JIT backends are affected (x86 generateProgram does not call __builtin___clear_cache, so its flush in enableExecution is not redundant and is left as-is). Note: jit_compiler_rv64.cpp::enableExecution has the same redundant double-flush pattern on platforms where MAP_JIT applies. That backend is out of scope for this patch. 
Verified on Apple M4 Pro (macOS 26.5.1, Apple clang 21.0.0, -march=armv8-a+crypto -mcpu=apple-m4): Profiler: sys_icache_invalidate via setPagesRX eliminated (~3% of CPU samples removed from this code path) Benchmark: directionally +1-3% H/s across multiple trial sets; performance variance reduced (stdev 29 vs 58 H/s) Hash output: identical to unpatched binary for identical nonce sequences --- src/jit_compiler_a64.cpp | 2 +- src/virtual_memory.c | 16 ++++++++++++++++ src/virtual_memory.h | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index d2a1d6c3..de4120d4 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -116,7 +116,7 @@ void JitCompilerA64::enableWriting() void JitCompilerA64::enableExecution() { - setPagesRX(code, CodeSize + CalcDatasetItemSize); + setPagesRXKeepIcache(code, CodeSize + CalcDatasetItemSize); } void JitCompilerA64::enableAll() diff --git a/src/virtual_memory.c b/src/virtual_memory.c index 1000d99f..cdeddfc3 100644 --- a/src/virtual_memory.c +++ b/src/virtual_memory.c @@ -198,6 +198,22 @@ void setPagesRX(void* ptr, size_t bytes) { #endif } +void setPagesRXKeepIcache(void* ptr, size_t bytes) { +#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ + && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 + if (__builtin_available(macOS 11.0, *)) { + /* The A64 JIT already called __builtin___clear_cache on the exact + modified range inside generateProgram. Only toggle write-protect + here; re-flushing the full buffer is redundant and measurably + expensive on Apple Silicon. 
*/ + pthread_jit_write_protect_np(1); + return; + } +#endif + char *errfunc; + pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc); +} + void setPagesRWX(void* ptr, size_t bytes) { char *errfunc; pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE, &errfunc); diff --git a/src/virtual_memory.h b/src/virtual_memory.h index 5e8e31d5..063edb05 100644 --- a/src/virtual_memory.h +++ b/src/virtual_memory.h @@ -39,6 +39,7 @@ extern "C" { void* allocMemoryPages(size_t); void setPagesRW(void*, size_t); void setPagesRX(void*, size_t); +void setPagesRXKeepIcache(void*, size_t); void setPagesRWX(void*, size_t); void* allocLargePagesMemory(size_t); void freePagedMemory(void*, size_t);