diff --git a/host/src/dylink.ts b/host/src/dylink.ts index f159d2b8e..9ca6d2784 100644 --- a/host/src/dylink.ts +++ b/host/src/dylink.ts @@ -345,6 +345,28 @@ function instantiateSharedLibrary( ? new (WebAssembly as any).Tag({ parameters: ["i32"] }) : undefined; + // C++ side modules built with -fwasm-exceptions import an env.__cpp_exception + // tag. It is module-local, so an exception thrown here can only be caught + // within this module — fine while side modules don't let C++ exceptions + // escape across the boundary; cross-module EH would need a process-shared tag. + const cppExceptionTag = (typeof (WebAssembly as any).Tag === "function") + ? new (WebAssembly as any).Tag({ parameters: ["i32"] }) + : undefined; + + // A self-contained C++ side module both defines and imports its weak COMDAT + // symbols (virtual dtors, template instantiations, operator new/delete): + // wasm-ld routes default-visibility weak symbols through env so a main module + // could interpose them, but a pure-C main module exports none of them. Inspect + // the declared imports up front so the env proxy can satisfy such a symbol + // from the module's own exports instead of failing instantiation. 
+ const module = new WebAssembly.Module(wasmBytes as unknown as BufferSource); + const envFunctionImports = new Set( + WebAssembly.Module.imports(module) + .filter((imp) => imp.module === "env" && imp.kind === "function") + .map((imp) => imp.name), + ); + let selfExports: WebAssembly.Exports | undefined; + // Construct imports const imports: WebAssembly.Imports = { env: new Proxy({} as Record, { @@ -356,15 +378,34 @@ function instantiateSharedLibrary( case "__table_base": return tableBaseGlobal; case "__stack_pointer": return options.stackPointer; case "__c_longjmp": return longjmpTag; + case "__cpp_exception": return cppExceptionTag; } const sym = options.globalSymbols.get(prop); if (sym !== undefined) return sym; + // Weak self-import: route to the module's own exports, which exist only + // after instantiation. A genuinely missing symbol throws at call time + // rather than silently returning 0, keeping a real ABI gap truthful. + // Data imports resolve via GOT.mem, so only function imports land here. 
+ if (envFunctionImports.has(prop)) { + return (...args: unknown[]) => { + const fn = selfExports?.[prop]; + if (typeof fn !== "function") { + throw new Error( + `${name}: import env.${prop} is not provided by the main ` + + `module or the side module's own exports`, + ); + } + return (fn as Function)(...args); + }; + } return undefined; }, has(_target, prop: string) { if (["memory", "__indirect_function_table", "__memory_base", - "__table_base", "__stack_pointer", "__c_longjmp"].includes(prop)) return true; - return options.globalSymbols.has(prop); + "__table_base", "__stack_pointer", "__c_longjmp", + "__cpp_exception"].includes(prop)) return true; + if (options.globalSymbols.has(prop)) return true; + return envFunctionImports.has(prop); }, }), "GOT.mem": new Proxy({} as Record, { @@ -379,9 +420,11 @@ function instantiateSharedLibrary( }), }; - // Compile and instantiate synchronously - const module = new WebAssembly.Module(wasmBytes as unknown as BufferSource); + // Capture exports immediately so the weak-self-import trampolines can reach + // the module's own functions. Instantiation never fires them: side modules + // have no start function and defer ctors to the __wasm_call_ctors call below. const instance = new WebAssembly.Instance(module, imports); + selfExports = instance.exports; // Relocate exports: data address globals need memoryBase added const relocatedExports: Record = {}; diff --git a/host/test/dylink.test.ts b/host/test/dylink.test.ts index c9ad7bc05..bf087f61c 100644 --- a/host/test/dylink.test.ts +++ b/host/test/dylink.test.ts @@ -375,3 +375,91 @@ describe.skipIf(!hasCompiler())("DynamicLinker", () => { expect(h1).toBe(h2); }); }); + +function hasWat2Wasm(): boolean { + try { + execFileSync("wat2wasm", ["--version"], { stdio: "ignore" }); + return true; + } catch { + return false; + } +} + +// A minimal dylink.0 section (MEM_INFO: mem/table size + align all 0), which +// marks a module as a side module. 
wat2wasm emits custom sections last, but the +// loader requires dylink.0 first, so it is injected as raw bytes below rather +// than declared in the WAT. +const DYLINK_SECTION = new Uint8Array([ + 0x00, 0x0f, // custom section, size 15 + 0x08, 0x64, 0x79, 0x6c, 0x69, 0x6e, 0x6b, 0x2e, 0x30, // name "dylink.0" + 0x01, 0x04, 0x00, 0x00, 0x00, 0x00, // MEM_INFO subsection +]); + +/** + * Assemble a hand-written side module. Using WAT (rather than a compiled C/C++ + * fixture) lets these tests reproduce the exact import shapes a real C++ side + * module produces — a self-defined symbol that is *also* an env import, and an + * env.__cpp_exception tag — which no self-contained C fixture can emit. + */ +function assembleSideModule(wat: string, name: string): Uint8Array { + const dir = join(tmpdir(), "wasm-dylink-test"); + mkdirSync(dir, { recursive: true }); + const watPath = join(dir, `${name}.wat`); + const wasmPath = join(dir, `${name}.wasm`); + writeFileSync(watPath, wat); + execFileSync("wat2wasm", ["--enable-exceptions", watPath, "-o", wasmPath], + { stdio: "pipe" }); + const raw = new Uint8Array(readFileSync(wasmPath)); + const out = new Uint8Array(8 + DYLINK_SECTION.length + (raw.length - 8)); + out.set(raw.subarray(0, 8), 0); // magic + version + out.set(DYLINK_SECTION, 8); // dylink.0 first + out.set(raw.subarray(8), 8 + DYLINK_SECTION.length); + return out; +} + +describe.skipIf(!hasWat2Wasm())("weak self-import handling", () => { + function createLoadOptions(): LoadSharedLibraryOptions { + return { + memory: new WebAssembly.Memory({ initial: 1, maximum: 100, shared: true }), + table: new WebAssembly.Table({ initial: 1, element: "anyfunc" }), + stackPointer: new WebAssembly.Global({ value: "i32", mutable: true }, 65536), + heapPointer: { value: 1024 }, + globalSymbols: new Map(), + got: new Map(), + loadedLibraries: new Map(), + }; + } + + it("routes an unresolved env import to the module's own export", () => { + // `self_fn` is both imported from env and 
exported: the shape wasm-ld emits + // for an interposable weak C++ symbol the module also defines. + const lib = loadSharedLibrarySync("self-import.so", assembleSideModule(` + (module + (import "env" "self_fn" (func $self_fn (result i32))) + (func (export "self_fn") (result i32) (i32.const 42)) + (func (export "call_self") (result i32) (call $self_fn))) + `, "self-import"), createLoadOptions()); + expect((lib.exports.call_self as Function)()).toBe(42); + }); + + it("throws loudly when a self-import has no defining export", () => { + // A genuinely absent symbol must trap at call time, not silently return 0. + const lib = loadSharedLibrarySync("missing-import.so", assembleSideModule(` + (module + (import "env" "missing_fn" (func $missing (result i32))) + (func (export "call_missing") (result i32) (call $missing))) + `, "missing-import"), createLoadOptions()); + expect(() => (lib.exports.call_missing as Function)()).toThrow(/not provided/); + }); + + it("provides the __cpp_exception tag to -fwasm-exceptions modules", () => { + // C++ side modules import this tag; without the host providing it, the + // module fails to instantiate with "tag import requires a WebAssembly.Tag". + const lib = loadSharedLibrarySync("tag.so", assembleSideModule(` + (module + (import "env" "__cpp_exception" (tag $exc (param i32))) + (func (export "noop"))) + `, "tag"), createLoadOptions()); + expect(lib.exports.noop).toBeTypeOf("function"); + }); +}); diff --git a/packages/registry/icu/build-icu.sh b/packages/registry/icu/build-icu.sh new file mode 100755 index 000000000..ebd224126 --- /dev/null +++ b/packages/registry/icu/build-icu.sh @@ -0,0 +1,187 @@ +#!/usr/bin/env bash +# +# Build ICU4C (libicuuc.a, libicui18n.a, libicudata.a stub + icu.dat) for +# wasm32-posix-kernel. +# +# ICU requires a TWO-STAGE build: +# +# Stage 1 (HOST): build ICU natively to produce the data-generation tools +# (genrb, pkgdata, icupkg, genccode, …) and the ICU common +# data. These run on the build machine. 
+# Stage 2 (CROSS): configure ICU for wasm32 with --with-cross-build pointing +# at the stage-1 build dir. The cross build reuses the host +# tools and host-generated data; it only compiles the C++ +# sources into wasm32 static libraries. +# +# Data is built in `archive` packaging mode, which emits the ICU common data as +# a standalone `icudt${ICU_MAJOR}l.dat` file (NOT linked into libicudata.a — that +# becomes a stub). We install that file as `share/icu.dat`; PHP's intl side +# module loads it at runtime via udata_setCommonData() (the name `icu.dat` is +# deliberate and is NOT ICU's default-searched name). See +# packages/registry/php/build-php.sh for the intl side. +# +# Honors the dep-resolver build-script contract (see docs/package-management.md). +# When invoked via `cargo xtask build-deps resolve icu`, the resolver sets: +# WASM_POSIX_DEP_OUT_DIR # where to install +# WASM_POSIX_DEP_VERSION # upstream version (e.g. "74.2") +# WASM_POSIX_DEP_SOURCE_URL # tarball URL +# WASM_POSIX_DEP_SOURCE_SHA256 # expected sha256 of the tarball +# WASM_POSIX_DEP_LIBCXX_DIR # resolved libcxx prefix (direct dep) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +# shellcheck source=/dev/null +source "$REPO_ROOT/sdk/activate.sh" + +if ! command -v wasm32posix-cc &>/dev/null; then + echo "ERROR: wasm32posix-cc not found after sourcing sdk/activate.sh." 
>&2 + exit 1 +fi + +# --- Inputs from resolver, with ad-hoc fallbacks --- +ICU_VERSION="${WASM_POSIX_DEP_VERSION:-${ICU_VERSION:-74.2}}" +ICU_VER_UNDERSCORE="${ICU_VERSION//./_}" # 74.2 -> 74_2 +ICU_MAJOR="${ICU_VERSION%%.*}" # 74.2 -> 74 +INSTALL_DIR="${WASM_POSIX_DEP_OUT_DIR:-$SCRIPT_DIR/icu-install}" +SOURCE_URL="${WASM_POSIX_DEP_SOURCE_URL:-https://github.com/unicode-org/icu/releases/download/release-${ICU_MAJOR}-${ICU_VERSION#*.}/icu4c-${ICU_VER_UNDERSCORE}-src.tgz}" +SOURCE_SHA256="${WASM_POSIX_DEP_SOURCE_SHA256:-}" + +SYSROOT="${WASM_POSIX_SYSROOT:-$REPO_ROOT/sysroot}" +export WASM_POSIX_SYSROOT="$SYSROOT" + +SRC_ROOT="$SCRIPT_DIR/icu-src" # contains icu/ (with source/) +ICU_SRC="$SRC_ROOT/icu/source" +HOST_BUILD="$SCRIPT_DIR/host-build" # stage-1 native build (out-of-tree) + +# --- Resolve libcxx (ICU is C++), symlink into sysroot (mariadb pattern) --- +HOST_TARGET="$(rustc -vV | awk '/^host/ {print $2}')" +resolve_dep() { + (cd "$REPO_ROOT" && cargo run -p xtask --target "$HOST_TARGET" --quiet -- build-deps resolve "$1") +} +LIBCXX_PREFIX="${WASM_POSIX_DEP_LIBCXX_DIR:-}" +if [ -z "$LIBCXX_PREFIX" ]; then + echo "==> Resolving libcxx via cargo xtask build-deps..." + LIBCXX_PREFIX="$(resolve_dep libcxx)" +fi +[ -f "$LIBCXX_PREFIX/lib/libc++.a" ] || { echo "ERROR: libcxx resolve missing libc++.a at $LIBCXX_PREFIX" >&2; exit 1; } +[ -f "$LIBCXX_PREFIX/lib/libc++abi.a" ] || { echo "ERROR: libcxx resolve missing libc++abi.a at $LIBCXX_PREFIX" >&2; exit 1; } +[ -d "$LIBCXX_PREFIX/include/c++/v1" ] || { echo "ERROR: libcxx resolve missing include/c++/v1 at $LIBCXX_PREFIX" >&2; exit 1; } + +echo "==> Linking libcxx into sysroot ($LIBCXX_PREFIX)..." +mkdir -p "$SYSROOT/lib" "$SYSROOT/include/c++" +ln -sf "$LIBCXX_PREFIX/lib/libc++.a" "$SYSROOT/lib/libc++.a" +ln -sf "$LIBCXX_PREFIX/lib/libc++abi.a" "$SYSROOT/lib/libc++abi.a" +rm -rf "$SYSROOT/include/c++/v1" +ln -sfn "$LIBCXX_PREFIX/include/c++/v1" "$SYSROOT/include/c++/v1" + +# --- Fetch + verify source --- +if [ ! 
-d "$ICU_SRC" ]; then + echo "==> Downloading ICU $ICU_VERSION..." + TARBALL="/tmp/icu4c-${ICU_VER_UNDERSCORE}-src.tgz" + curl --retry 10 --retry-delay 5 --retry-max-time 300 --retry-all-errors -fsSL "$SOURCE_URL" -o "$TARBALL" + if [ -n "$SOURCE_SHA256" ]; then + echo "==> Verifying source sha256..." + echo "$SOURCE_SHA256 $TARBALL" | shasum -a 256 -c - + fi + mkdir -p "$SRC_ROOT" + tar xzf "$TARBALL" -C "$SRC_ROOT" # extracts icu/ + rm "$TARBALL" +fi + +NPROC="$(sysctl -n hw.ncpu 2>/dev/null || nproc)" + +# ============================================================ +# Stage 1 — HOST build (native tools + data) +# ============================================================ +# Uses the host compiler (clang/clang++ from the dev shell), NOT the wasm +# wrappers. sdk/activate.sh only prepends SDK bin to PATH; it does not export +# CC/CXX, so an explicit host CC/CXX keeps this stage native. +# +# On Linux, statically fold the GNU C++/GCC runtime into the data tools: the Nix +# CI runner has no libstdc++.so.6 on its loader path, so a dynamically linked +# icupkg/pkgdata (run here by Stage 2's make) aborts at exec with "cannot open +# shared object file". macOS clang links a self-contained libc++ and rejects the +# flags. LDFLAGS set here is honored: runConfigureICU re-exports it to configure. +case "$(uname -s)" in + Linux) HOST_LDFLAGS="-static-libstdc++ -static-libgcc" ;; + *) HOST_LDFLAGS="" ;; +esac +if [ ! -x "$HOST_BUILD/bin/icupkg" ] || [ ! -x "$HOST_BUILD/bin/genccode" ]; then # either tool missing => partial host build, rebuild + echo "==> Stage 1: building ICU natively for host tools + data..." 
+ rm -rf "$HOST_BUILD" + mkdir -p "$HOST_BUILD" + ( cd "$HOST_BUILD" + CC="${HOST_CC:-clang}" CXX="${HOST_CXX:-clang++}" \ + LDFLAGS="$HOST_LDFLAGS" \ + "$ICU_SRC/runConfigureICU" MacOSX \ + --enable-static --disable-shared \ + --disable-samples --disable-tests --disable-extras + make -j"$NPROC" + ) +else + echo "==> Stage 1: reusing existing host build at $HOST_BUILD" +fi + +# ============================================================ +# Stage 2 — CROSS build (wasm32 static libs) +# ============================================================ +echo "==> Stage 2: cross-configuring ICU for wasm32..." +# In-tree cross build (wasm32posix-configure runs ./configure in CWD). +# Scrub any prior cross-build state in the source tree. +cd "$ICU_SRC" +make distclean 2>/dev/null || true + +# ICU maps the configure host triple to a config/mh-* makefile +# fragment. Our SDK forces --host=wasm32-unknown-none, whose OS component +# ("none") ICU does not recognize, so it selects the stock config/mh-unknown — +# a stub that hard-errors "configure could not detect your platform" and aborts +# `make`. ICU's own remedy (printed in that error) is to supply mh-unknown from +# a known platform. We use mh-linux: this is a --disable-shared --enable-static +# build, so mh-linux's Linux shared-library rules are never exercised; only its +# generic compile rules apply, driven by our wasm CC/CXX. Idempotent overwrite, +# re-applied every run because a fresh source extraction resets it. +cp "$ICU_SRC/config/mh-linux" "$ICU_SRC/config/mh-unknown" + +# C++ flags: ICU 74 needs C++17. libc++ headers come from the sysroot symlink. +# LDFLAGS carries -lc++ -lc++abi so configure's C++ link probes resolve. +# -fPIC: ICU's static libs are absorbed into intl.so, a wasm SIDE MODULE linked +# with `-shared --experimental-pic`. 
wasm-ld requires EVERY input object to be +# position-independent; a non-PIC ICU object triggers "R_WASM_MEMORY_ADDR_SLEB +# cannot be used against symbol ...; recompile with -fPIC" at the intl.so link. +CXXFLAGS="-O2 -std=c++17 -fPIC" \ +CFLAGS="-O2 -fPIC" \ +LDFLAGS="-lc++ -lc++abi" \ +wasm32posix-configure \ + --with-cross-build="$HOST_BUILD" \ + --enable-static --disable-shared \ + --disable-tools --disable-tests --disable-samples --disable-extras \ + --disable-layoutex \ + --with-data-packaging=archive \ + --prefix="$INSTALL_DIR" + +echo "==> Stage 2: building wasm32 libraries..." +make -j"$NPROC" + +echo "==> Installing to $INSTALL_DIR..." +rm -rf "$INSTALL_DIR" +make install + +# --- Stage the common data as icu.dat (see header) --- +DAT_SRC="$(find "$ICU_SRC/data" "$HOST_BUILD/data" -name "icudt${ICU_MAJOR}l.dat" 2>/dev/null | head -1 || true)" +if [ -z "$DAT_SRC" ]; then + echo "ERROR: could not locate icudt${ICU_MAJOR}l.dat after build" >&2 + exit 1 +fi +mkdir -p "$INSTALL_DIR/share" +cp "$DAT_SRC" "$INSTALL_DIR/share/icu.dat" +echo "==> staged $(basename "$DAT_SRC") -> $INSTALL_DIR/share/icu.dat ($(wc -c < "$INSTALL_DIR/share/icu.dat") bytes)" + +# --- Sanity: the static libs we promise (icuio included: PHP's PHP_SETUP_ICU +# requires the icu-io pkg-config module, so intl won't configure without it) --- +for lib in libicuuc.a libicui18n.a libicuio.a libicudata.a; do + [ -f "$INSTALL_DIR/lib/$lib" ] || { echo "ERROR: missing $INSTALL_DIR/lib/$lib" >&2; exit 1; } +done +echo "==> ICU build complete." 
+ls -lh "$INSTALL_DIR/lib/"*.a "$INSTALL_DIR/share/icu.dat" diff --git a/packages/registry/icu/build.toml b/packages/registry/icu/build.toml new file mode 100644 index 000000000..731645e12 --- /dev/null +++ b/packages/registry/icu/build.toml @@ -0,0 +1,12 @@ +script_path = "packages/registry/icu/build-icu.sh" +repo_url = "https://github.com/Automattic/kandelo.git" +commit = "e96fbe3964d5ec6784f00f4d49bfcf70a2030e22" +# Revision 3: ICU static libs rebuilt with -fPIC so they can be absorbed into +# intl.so (a wasm side module built with -shared --experimental-pic, which +# requires all inputs to be position-independent). +# Revision 4: Stage-1 host tools statically link the GNU C++/GCC runtime on +# Linux so icupkg/pkgdata do not need libstdc++.so.6 on the Nix CI runner. +revision = 4 + +[binary] +index_url = "https://github.com/Automattic/kandelo/releases/download/binaries-abi-v{abi}/index.toml" diff --git a/packages/registry/icu/package.toml b/packages/registry/icu/package.toml new file mode 100644 index 000000000..355f39786 --- /dev/null +++ b/packages/registry/icu/package.toml @@ -0,0 +1,45 @@ +kind = "library" +name = "icu" +version = "74.2" +kernel_abi = 7 +# ICU4C is C++; it links libc++/libc++abi from the libcxx package. +depends_on = ["libcxx@21.1.7"] +# wasm32 only for now — the sole consumer is PHP's intl side module, +# which is a wasm32 build. Add wasm64 only if a wasm64 consumer appears. 
+arches = ["wasm32"] + +# ICU4C for wasm32, built as static libraries: +# lib/libicuuc.a (common) +# lib/libicui18n.a (internationalization) +# lib/libicudata.a (stubdata — real data lives in the .dat, see below) +# include/unicode/*.h +# share/icu.dat (common data archive; renamed from icudt74l.dat per +# the intl side-module design — loaded at runtime via +# udata_setCommonData, NOT ICU's default name search) +# +# ICU requires a two-stage build: a HOST build (to generate the data and the +# genrb/pkgdata/icupkg tools) followed by a wasm32 cross build pointed at the +# host build via --with-cross-build. Data is built in `archive` packaging mode +# so the common data is emitted as a standalone icudt74l.dat instead of being +# linked into libicudata.a; we stage that file as `icu.dat`. + +[source] +url = "https://github.com/unicode-org/icu/releases/download/release-74-2/icu4c-74_2-src.tgz" +sha256 = "68db082212a96d6f53e35d60f47d38b962e9f9d207a74cfac78029ae8ff5e08c" + +[license] +spdx = "ICU" +url = "https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE" + +[build] +script_path = "packages/registry/icu/build-icu.sh" + +[outputs] +libs = ["lib/libicuuc.a", "lib/libicui18n.a", "lib/libicuio.a", "lib/libicudata.a"] +headers = ["include/unicode"] +# NOTE: the build also installs share/icu.dat (the ICU common data archive, +# staged from icudt74l.dat). It is not declarable as a library output here +# (Outputs only takes libs/headers/pkgconfig); consumers such as PHP's intl +# side module read it directly from the resolved dep dir +# ($WASM_POSIX_DEP_ICU_DIR/share/icu.dat) and load it at runtime via +# udata_setCommonData(). See build-icu.sh. diff --git a/packages/registry/libcxx/.gitignore b/packages/registry/libcxx/.gitignore index 201dd024e..d13a937fe 100644 --- a/packages/registry/libcxx/.gitignore +++ b/packages/registry/libcxx/.gitignore @@ -1,5 +1,7 @@ node_modules/ # LLVM source clone (per major version) — build script populates on demand. 
llvm-project-*/ +# Per-arch LLVM source symlink tree the build script recreates each run. +llvm-source-*/ # Per-arch build trees produced by the build script. build-*/ diff --git a/packages/registry/libcxx/build-libcxx.sh b/packages/registry/libcxx/build-libcxx.sh index 96396ed8f..a89b35af2 100755 --- a/packages/registry/libcxx/build-libcxx.sh +++ b/packages/registry/libcxx/build-libcxx.sh @@ -103,7 +103,10 @@ if [ ! -d "$NIX_LIBUNWIND_SOURCE/libunwind" ]; then fi BUILD_DIR="$SCRIPT_DIR/build-${ARCH}" -LLVM_SRC_DIR="$BUILD_DIR/llvm-source" +# Assembled source tree lives OUTSIDE the build dirs so both the default +# (static, non-PIC) build and the position-independent build below can share it +# without one build's `rm -rf` deleting the other's source. +LLVM_SRC_DIR="$SCRIPT_DIR/llvm-source-${ARCH}" # --- Verify prerequisites --- if [ ! -f "$SYSROOT/lib/libc.a" ]; then @@ -136,10 +139,9 @@ echo "==> Building libc++ and libc++abi for ${ARCH}..." # SDK's `compileFlags` was updated in lock-step). WASM_C_FLAGS="--target=${WASM_TARGET} -matomics -mbulk-memory -mexception-handling -mllvm -wasm-enable-sjlj -mllvm -wasm-use-legacy-eh=false -fexceptions -fno-trapping-math --sysroot=${SYSROOT} -O2 -DNDEBUG" -# Always start with a fresh build tree so a cache-miss rebuild does -# not mix old + new cmake artifacts. -rm -rf "$BUILD_DIR" -mkdir -p "$BUILD_DIR" +# Start with a fresh source tree so a cache-miss rebuild does not mix old + new +# artifacts. (Each build dir is cleaned by build_libcxx_variant below.) +rm -rf "$LLVM_SRC_DIR" # Assemble the monorepo-shaped source tree expected by runtimes/CMakeLists.txt # from exact Nix source derivations. 
Nix's libcxx source carries runtimes/, @@ -160,63 +162,77 @@ for entry in "$NIX_LIBCXX_SOURCE/runtimes"/*; do done ln -s "$NIX_LIBUNWIND_SOURCE/libunwind" "$LLVM_SRC_DIR/libunwind" -cd "$BUILD_DIR" +NPROC="$(sysctl -n hw.ncpu 2>/dev/null || nproc)" -cmake -G "Unix Makefiles" -S "$LLVM_SRC_DIR/runtimes" \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ - -DCMAKE_SYSTEM_NAME=Generic \ - -DCMAKE_SYSTEM_PROCESSOR="${ARCH}" \ - -DCMAKE_C_COMPILER="$LLVM_CLANG" \ - -DCMAKE_CXX_COMPILER="$LLVM_CLANG" \ - -DCMAKE_AR="$LLVM_AR" \ - -DCMAKE_RANLIB="$LLVM_RANLIB" \ - -DCMAKE_NM="$LLVM_NM" \ - -DCMAKE_C_COMPILER_TARGET="${WASM_TARGET}" \ - -DCMAKE_CXX_COMPILER_TARGET="${WASM_TARGET}" \ - -DCMAKE_C_FLAGS="${WASM_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${WASM_C_FLAGS}" \ - -DCMAKE_SYSROOT="${SYSROOT}" \ - -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \ - \ - -DLIBCXX_ENABLE_SHARED=OFF \ - -DLIBCXX_ENABLE_STATIC=ON \ - -DLIBCXX_ENABLE_EXCEPTIONS=ON \ - -DLIBCXX_ENABLE_RTTI=ON \ - -DLIBCXX_HAS_MUSL_LIBC=ON \ - -DLIBCXX_HAS_PTHREAD_API=ON \ - -DLIBCXX_CXX_ABI=libcxxabi \ - -DLIBCXX_INCLUDE_BENCHMARKS=OFF \ - -DLIBCXX_INCLUDE_TESTS=OFF \ - -DLIBCXX_ENABLE_FILESYSTEM=ON \ - -DLIBCXX_ENABLE_MONOTONIC_CLOCK=ON \ - -DLIBCXX_ENABLE_RANDOM_DEVICE=OFF \ - -DLIBCXX_ENABLE_LOCALIZATION=ON \ - -DLIBCXX_ENABLE_WIDE_CHARACTERS=ON \ - -DLIBCXX_ENABLE_NEW_DELETE_DEFINITIONS=ON \ - \ - -DLIBCXXABI_ENABLE_SHARED=OFF \ - -DLIBCXXABI_ENABLE_STATIC=ON \ - -DLIBCXXABI_ENABLE_EXCEPTIONS=ON \ - -DLIBCXXABI_USE_LLVM_UNWINDER=ON \ - -DLIBCXXABI_ENABLE_STATIC_UNWINDER=ON \ - -DLIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY=ON \ - -DLIBCXXABI_ENABLE_THREADS=ON \ - -DLIBCXXABI_HAS_PTHREAD_API=ON \ - -DLIBCXXABI_INCLUDE_TESTS=OFF \ - \ - -DLIBUNWIND_ENABLE_SHARED=OFF \ - -DLIBUNWIND_ENABLE_STATIC=ON \ - -DLIBUNWIND_ENABLE_THREADS=ON \ - -DLIBUNWIND_USE_COMPILER_RT=OFF \ - -DLIBUNWIND_INCLUDE_TESTS=OFF \ - -DLIBUNWIND_HIDE_SYMBOLS=ON \ - \ - -DCMAKE_SIZEOF_VOID_P="${SIZEOF_VOID_P}" \ - 2>&1 | tail -20 
+# Configure + build libc++/libc++abi/libunwind into the given build dir with the given +# compile-flags string (plus any extra cmake args). Factored so the default +# static archives and the position-independent variant (below) share ONE cmake +# recipe and cannot drift apart. +build_libcxx_variant() { + local variant_build_dir="$1"; shift + local variant_c_flags="$1"; shift + rm -rf "$variant_build_dir" + mkdir -p "$variant_build_dir" + ( cd "$variant_build_dir" + cmake -G "Unix Makefiles" -S "$LLVM_SRC_DIR/runtimes" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ + -DCMAKE_SYSTEM_NAME=Generic \ + -DCMAKE_SYSTEM_PROCESSOR="${ARCH}" \ + -DCMAKE_C_COMPILER="$LLVM_CLANG" \ + -DCMAKE_CXX_COMPILER="$LLVM_CLANG" \ + -DCMAKE_AR="$LLVM_AR" \ + -DCMAKE_RANLIB="$LLVM_RANLIB" \ + -DCMAKE_NM="$LLVM_NM" \ + -DCMAKE_C_COMPILER_TARGET="${WASM_TARGET}" \ + -DCMAKE_CXX_COMPILER_TARGET="${WASM_TARGET}" \ + -DCMAKE_C_FLAGS="${variant_c_flags}" \ + -DCMAKE_CXX_FLAGS="${variant_c_flags}" \ + -DCMAKE_SYSROOT="${SYSROOT}" \ + -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \ + \ + -DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_ENABLE_EXCEPTIONS=ON \ + -DLIBCXX_ENABLE_RTTI=ON \ + -DLIBCXX_HAS_MUSL_LIBC=ON \ + -DLIBCXX_HAS_PTHREAD_API=ON \ + -DLIBCXX_CXX_ABI=libcxxabi \ + -DLIBCXX_INCLUDE_BENCHMARKS=OFF \ + -DLIBCXX_INCLUDE_TESTS=OFF \ + -DLIBCXX_ENABLE_FILESYSTEM=ON \ + -DLIBCXX_ENABLE_MONOTONIC_CLOCK=ON \ + -DLIBCXX_ENABLE_RANDOM_DEVICE=OFF \ + -DLIBCXX_ENABLE_LOCALIZATION=ON \ + -DLIBCXX_ENABLE_WIDE_CHARACTERS=ON \ + -DLIBCXX_ENABLE_NEW_DELETE_DEFINITIONS=ON \ + \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_ENABLE_EXCEPTIONS=ON \ + -DLIBCXXABI_USE_LLVM_UNWINDER=ON \ + -DLIBCXXABI_ENABLE_STATIC_UNWINDER=ON \ + -DLIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY=ON \ + -DLIBCXXABI_ENABLE_THREADS=ON \ + -DLIBCXXABI_HAS_PTHREAD_API=ON \ + -DLIBCXXABI_INCLUDE_TESTS=OFF \ + \ + -DLIBUNWIND_ENABLE_SHARED=OFF \ + -DLIBUNWIND_ENABLE_STATIC=ON 
\ + -DLIBUNWIND_ENABLE_THREADS=ON \ + -DLIBUNWIND_USE_COMPILER_RT=OFF \ + -DLIBUNWIND_INCLUDE_TESTS=OFF \ + -DLIBUNWIND_HIDE_SYMBOLS=ON \ + \ + -DCMAKE_SIZEOF_VOID_P="${SIZEOF_VOID_P}" \ + "$@" \ + 2>&1 | tail -20 -echo "==> Compiling (this may take a few minutes)..." -NPROC="$(sysctl -n hw.ncpu 2>/dev/null || nproc)" -make -j"$NPROC" cxx cxxabi unwind 2>&1 | tail -10 + echo "==> Compiling (this may take a few minutes)..." + make -j"$NPROC" cxx cxxabi unwind 2>&1 | tail -10 ) +} + +echo "==> Building default (static, non-PIC) libc++/libc++abi for ${ARCH}..." +build_libcxx_variant "$BUILD_DIR" "${WASM_C_FLAGS}" # --- Install into the resolver's OUT_DIR --- echo "==> Installing to $INSTALL_DIR..." @@ -298,7 +314,32 @@ if ! "$LLVM_CLANG" ${WASM_C_FLAGS} \ fi echo "==> Header smoke compile passed." +# --- Position-independent variant for wasm side modules --- +# The default archives above are non-PIC, which is correct for the common case: +# static linking into a main wasm module (php.wasm, mariadb, ruby). But a wasm +# SIDE MODULE (built with `-shared --experimental-pic`, e.g. PHP's intl.so, which +# statically absorbs libc++/libc++abi) requires EVERY input object to be +# position-independent, or wasm-ld fails with "relocation R_WASM_MEMORY_ADDR_SLEB +# cannot be used against symbol ...; recompile with -fPIC". Emit a parallel PIC +# pair alongside the defaults. This is purely additive: libc++.a / libc++abi.a +# and the header set above are untouched, so existing static consumers are +# unaffected; only side-module consumers reach for the -pic archives. +echo "==> Building position-independent libc++/libc++abi (for wasm side modules)..." 
+PIC_BUILD_DIR="$SCRIPT_DIR/build-${ARCH}-pic" +build_libcxx_variant "$PIC_BUILD_DIR" "${WASM_C_FLAGS} -fPIC" -DCMAKE_POSITION_INDEPENDENT_CODE=ON + +LIBCXX_PIC_A=$(find "$PIC_BUILD_DIR" -name "libc++.a" -not -path "*/CMakeFiles/*" | head -1) +LIBCXXABI_PIC_A=$(find "$PIC_BUILD_DIR" -name "libc++abi.a" -not -path "*/CMakeFiles/*" | head -1) +if [ -z "$LIBCXX_PIC_A" ] || [ -z "$LIBCXXABI_PIC_A" ]; then + echo "ERROR: PIC libraries not found under $PIC_BUILD_DIR" >&2 + exit 1 +fi +cp "$LIBCXX_PIC_A" "$INSTALL_DIR/lib/libc++-pic.a" +cp "$LIBCXXABI_PIC_A" "$INSTALL_DIR/lib/libc++abi-pic.a" + echo "==> Done!" -echo " libc++.a: $(wc -c < "$INSTALL_DIR/lib/libc++.a" | tr -d ' ') bytes" -echo " libc++abi.a: $(wc -c < "$INSTALL_DIR/lib/libc++abi.a" | tr -d ' ') bytes" -echo " headers: $INSTALL_DIR/include/c++/v1/" +echo " libc++.a: $(wc -c < "$INSTALL_DIR/lib/libc++.a" | tr -d ' ') bytes" +echo " libc++abi.a: $(wc -c < "$INSTALL_DIR/lib/libc++abi.a" | tr -d ' ') bytes" +echo " libc++-pic.a: $(wc -c < "$INSTALL_DIR/lib/libc++-pic.a" | tr -d ' ') bytes" +echo " libc++abi-pic.a: $(wc -c < "$INSTALL_DIR/lib/libc++abi-pic.a" | tr -d ' ') bytes" +echo " headers: $INSTALL_DIR/include/c++/v1/" diff --git a/packages/registry/libcxx/build.toml b/packages/registry/libcxx/build.toml index 875768a9e..9e316ba77 100644 --- a/packages/registry/libcxx/build.toml +++ b/packages/registry/libcxx/build.toml @@ -7,7 +7,10 @@ commit = "8c53383229fab78f97b098c3207a655159c03041" # source derivations, hard-fails on compiler/source version drift, and # installs headers from the build tree so the header set cannot drift # from the built library. -revision = 5 +# Revision 6: additionally emits position-independent libc++-pic.a / +# libc++abi-pic.a (a second -fPIC build) so wasm side modules like PHP's +# intl.so can statically absorb libc++. The non-PIC pair is unchanged. 
+revision = 6 [binary] index_url = "https://github.com/Automattic/kandelo/releases/download/binaries-abi-v{abi}/index.toml" diff --git a/packages/registry/libcxx/package.toml b/packages/registry/libcxx/package.toml index 49d406570..9fcc01e8c 100644 --- a/packages/registry/libcxx/package.toml +++ b/packages/registry/libcxx/package.toml @@ -42,5 +42,11 @@ url = "https://github.com/llvm/llvm-project/blob/main/LICENSE.TXT" script_path = "packages/registry/libcxx/build-libcxx.sh" [outputs] -libs = ["lib/libc++.a", "lib/libc++abi.a"] +# libc++.a / libc++abi.a are the default non-PIC archives for static linking +# into a main wasm module (php.wasm, mariadb, ruby). libc++-pic.a / +# libc++abi-pic.a are the position-independent variants required by wasm SIDE +# MODULES (`-shared --experimental-pic`), e.g. PHP's intl.so, which statically +# absorbs libc++ and would otherwise hit a wasm-ld "recompile with -fPIC" error. +# Additive: existing consumers keep using the non-PIC pair unchanged. +libs = ["lib/libc++.a", "lib/libc++abi.a", "lib/libc++-pic.a", "lib/libc++abi-pic.a"] headers = ["include/c++/v1"] diff --git a/packages/registry/php/build-php.sh b/packages/registry/php/build-php.sh index 0407f915a..a6be673a1 100755 --- a/packages/registry/php/build-php.sh +++ b/packages/registry/php/build-php.sh @@ -48,20 +48,51 @@ OPENSSL_PREFIX="${WASM_POSIX_DEP_OPENSSL_DIR:-}" [ -z "$OPENSSL_PREFIX" ] && { echo "==> Resolving openssl..."; OPENSSL_PREFIX="$(resolve_dep openssl)"; } LIBXML2_PREFIX="${WASM_POSIX_DEP_LIBXML2_DIR:-}" [ -z "$LIBXML2_PREFIX" ] && { echo "==> Resolving libxml2..."; LIBXML2_PREFIX="$(resolve_dep libxml2)"; } +# ICU + libcxx back the intl side module only; they are linked into intl.so, not +# php.wasm (see the intl.so build below), so base PHP stays ICU-free. 
+ICU_PREFIX="${WASM_POSIX_DEP_ICU_DIR:-}" +[ -z "$ICU_PREFIX" ] && { echo "==> Resolving icu..."; ICU_PREFIX="$(resolve_dep icu)"; } +LIBCXX_PREFIX="${WASM_POSIX_DEP_LIBCXX_DIR:-}" +[ -z "$LIBCXX_PREFIX" ] && { echo "==> Resolving libcxx..."; LIBCXX_PREFIX="$(resolve_dep libcxx)"; } [ -f "$ZLIB_PREFIX/lib/libz.a" ] || { echo "ERROR: zlib resolve missing libz.a"; exit 1; } [ -f "$SQLITE_PREFIX/lib/libsqlite3.a" ] || { echo "ERROR: sqlite resolve missing libsqlite3.a"; exit 1; } [ -f "$OPENSSL_PREFIX/lib/libssl.a" ] || { echo "ERROR: openssl resolve missing libssl.a"; exit 1; } [ -f "$LIBXML2_PREFIX/lib/libxml2.a" ] || { echo "ERROR: libxml2 resolve missing libxml2.a"; exit 1; } +[ -f "$ICU_PREFIX/lib/libicuuc.a" ] || { echo "ERROR: icu resolve missing libicuuc.a"; exit 1; } +[ -f "$ICU_PREFIX/share/icu.dat" ] || { echo "ERROR: icu resolve missing share/icu.dat"; exit 1; } +[ -f "$LIBCXX_PREFIX/lib/libc++.a" ] || { echo "ERROR: libcxx resolve missing libc++.a"; exit 1; } +# A -shared PIC side module needs the position-independent libc++ (libcxx +# revision >= 6); the non-PIC pair above is for the main php.wasm link. +[ -f "$LIBCXX_PREFIX/lib/libc++-pic.a" ] || { echo "ERROR: libcxx resolve missing libc++-pic.a — rebuild libcxx (revision >= 6)"; exit 1; } +[ -f "$LIBCXX_PREFIX/lib/libc++abi-pic.a" ] || { echo "ERROR: libcxx resolve missing libc++abi-pic.a — rebuild libcxx (revision >= 6)"; exit 1; } echo "==> zlib at $ZLIB_PREFIX" echo "==> sqlite at $SQLITE_PREFIX" echo "==> openssl at $OPENSSL_PREFIX" echo "==> libxml2 at $LIBXML2_PREFIX" +echo "==> icu at $ICU_PREFIX" +echo "==> libcxx at $LIBCXX_PREFIX" -# Compose PKG_CONFIG_PATH for all 4 deps so wasm32posix-configure's -# pkg-config probes can find them in the cache instead of the sysroot. 
-DEP_PKG_CONFIG_PATH="$ZLIB_PREFIX/lib/pkgconfig:$SQLITE_PREFIX/lib/pkgconfig:$OPENSSL_PREFIX/lib/pkgconfig:$LIBXML2_PREFIX/lib/pkgconfig" +# Make libc++ visible in the sysroot so ext/intl (C++) compiles and intl.so +# links against it (mirrors packages/registry/mariadb/build-mariadb.sh). +mkdir -p "$SYSROOT/lib" "$SYSROOT/include/c++" +ln -sf "$LIBCXX_PREFIX/lib/libc++.a" "$SYSROOT/lib/libc++.a" +ln -sf "$LIBCXX_PREFIX/lib/libc++abi.a" "$SYSROOT/lib/libc++abi.a" +rm -rf "$SYSROOT/include/c++/v1" +ln -sfn "$LIBCXX_PREFIX/include/c++/v1" "$SYSROOT/include/c++/v1" +# Enabling a C++ extension makes PHP's PHP_REQUIRE_CXX append -lstdc++ to the +# main SAPI link (upstream assumes GNU libstdc++, but our runtime is LLVM +# libc++). intl bundles its own libc++, so the main SAPIs reference no C++ +# symbols and -lstdc++ only needs to resolve — bridge the name to our libc++. +ln -sf "$LIBCXX_PREFIX/lib/libc++.a" "$SYSROOT/lib/libstdc++.a" + +# Compose PKG_CONFIG_PATH so wasm32posix-configure's pkg-config probes can find +# the deps in the cache instead of the sysroot. ICU is included so PHP_SETUP_ICU +# detects it and enables the (shared) intl extension. +DEP_PKG_CONFIG_PATH="$ZLIB_PREFIX/lib/pkgconfig:$SQLITE_PREFIX/lib/pkgconfig:$OPENSSL_PREFIX/lib/pkgconfig:$LIBXML2_PREFIX/lib/pkgconfig:$ICU_PREFIX/lib/pkgconfig" # Compose -I and -L flags for defense-in-depth (autoconf raw probes). +# ICU's -I/-L are deliberately omitted so ICU can't leak into the main link; +# ext/intl gets ICU_CFLAGS/ICU_LIBS from configure and intl.so links ICU below. DEP_CPPFLAGS="-I$ZLIB_PREFIX/include -I$SQLITE_PREFIX/include -I$OPENSSL_PREFIX/include -I$LIBXML2_PREFIX/include" DEP_LDFLAGS="-L$ZLIB_PREFIX/lib -L$SQLITE_PREFIX/lib -L$OPENSSL_PREFIX/lib -L$LIBXML2_PREFIX/lib" @@ -165,6 +196,12 @@ if [ ! -f Makefile ]; then # invokes on our wasm port — but the import has to resolve at # instantiation time). 
# + # The second -u group forces libc symbols intl.so imports but base PHP + # never references (allocator, wide-char, math, and the pthread mutex/ + # cond/TLS that ICU's UMutex uses). They must resolve to php.wasm's own + # musl so intl.so shares one libc state — one allocator, one pthread key + # table; without -u they never enter php.wasm and intl.so fails to load. + # # -Wl,-z,stack-size=4194304: 4 MB wasm stack. The default wasm-ld # stack is 64 KB, which sits ~100 KB above PHP's `alloc_globals` # data segment. Opcache's PASS_6 (DFA-based SSA optimization) calls @@ -181,6 +218,13 @@ if [ ! -f Makefile ]; then CPPFLAGS="$DEP_CPPFLAGS" \ LDFLAGS="$DEP_LDFLAGS -ldl -Wl,--export-all \ -u setgid -u setuid -u initgroups -u writev -u asctime \ +-u aligned_alloc -u div -u modf -u round -u tanhf \ +-u swprintf -u wcstod -u wcstof -u wcstol -u wcstold \ +-u wcstoll -u wcstoul -u wcstoull -u wmemchr -u wmemcmp \ +-u pthread_cond_broadcast -u pthread_cond_destroy -u pthread_cond_signal \ +-u pthread_cond_timedwait -u pthread_cond_wait -u pthread_detach \ +-u pthread_getspecific -u pthread_key_create -u pthread_self \ +-u pthread_setspecific \ -Wl,-z,stack-size=4194304" \ wasm32posix-configure \ --disable-all \ @@ -189,6 +233,7 @@ if [ ! -f Makefile ]; then --enable-cli \ --enable-fpm \ --enable-opcache \ + --enable-intl=shared \ --enable-mbstring \ --disable-mbregex \ --enable-ctype \ @@ -307,6 +352,39 @@ wasm32posix-cc -shared -fPIC -o "$SCRIPT_DIR/bin/opcache.so" \ ext/opcache/.libs/shared_alloc_posix.o echo "==> opcache.so: $(wc -c < "$SCRIPT_DIR/bin/opcache.so") bytes" +# Build intl as a shared .so, same libtool workaround as opcache: make compiles +# the PIC objects under ext/intl/**/.libs/ but the bundled libtool can't emit the +# final .so on this target, so we link it with `wasm32posix-cc -shared`. 
intl +# statically absorbs ICU and libc++/libc++abi so neither enters php.wasm; the ICU +# common data stays out of the .so as icu.dat (loaded by intl-icu-data-loader.c). +echo "==> Building intl.so (PHP extension)..." +make -j"$(sysctl -n hw.ncpu 2>/dev/null || nproc)" EXTRA_CFLAGS="$EXTRA_INC_LIBXML" ext/intl/intl.la || true + +# Compile the icu.dat loader (PIC) that feeds ICU its common data at dlopen. +wasm32posix-cc -fPIC -O2 -c "$SCRIPT_DIR/intl-icu-data-loader.c" \ + -I"$ICU_PREFIX/include" -o ext/intl/kandelo_icu_data_loader.o + +# Collect every PIC object libtool produced for ext/intl (top dir + the +# collator/, dateformat/, formatter/, … subdirs each have their own .libs/). +mapfile -t INTL_OBJS < <(find ext/intl -path '*/.libs/*.o' | sort) +[ "${#INTL_OBJS[@]}" -gt 0 ] || { echo "ERROR: no ext/intl PIC objects found — did 'make ext/intl/intl.la' compile?" >&2; exit 1; } +echo "==> linking intl.so from ${#INTL_OBJS[@]} objects + ICU static libs + libc++" + +# wasm-ld resolves archive back-references without --start-group, so the ICU +# archives are listed in dependency order (i18n -> io -> uc -> data), then +# libc++/libc++abi. A -shared PIC module requires every input to be PIC, so the +# libc++ PIC variants are named explicitly to win over the non-PIC sysroot ones. 
+wasm32posix-cc -shared -fPIC -o "$SCRIPT_DIR/bin/intl.so" \ + "${INTL_OBJS[@]}" \ + ext/intl/kandelo_icu_data_loader.o \ + "$ICU_PREFIX/lib/libicui18n.a" \ + "$ICU_PREFIX/lib/libicuio.a" \ + "$ICU_PREFIX/lib/libicuuc.a" \ + "$ICU_PREFIX/lib/libicudata.a" \ + "$LIBCXX_PREFIX/lib/libc++-pic.a" \ + "$LIBCXX_PREFIX/lib/libc++abi-pic.a" +echo "==> intl.so: $(wc -c < "$SCRIPT_DIR/bin/intl.so") bytes" + # Copy to bin/ with .wasm extension (needed for Vite browser demos) mkdir -p "$SCRIPT_DIR/bin" cp sapi/cli/php "$SCRIPT_DIR/bin/php.wasm" @@ -345,3 +423,4 @@ source "$REPO_ROOT/scripts/install-local-binary.sh" install_local_binary php "$SCRIPT_DIR/bin/php.wasm" php.wasm install_local_binary php "$SCRIPT_DIR/bin/php-fpm.wasm" php-fpm.wasm install_local_binary php "$SCRIPT_DIR/bin/opcache.so" +install_local_binary php "$SCRIPT_DIR/bin/intl.so" diff --git a/packages/registry/php/build.toml b/packages/registry/php/build.toml index eae64edbc..27c7c7861 100644 --- a/packages/registry/php/build.toml +++ b/packages/registry/php/build.toml @@ -1,7 +1,7 @@ script_path = "packages/registry/php/build-php.sh" repo_url = "https://github.com/brandonpayton/kandelo.git" commit = "8c53383229fab78f97b098c3207a655159c03041" -revision = 3 +revision = 4 [binary] index_url = "https://github.com/Automattic/kandelo/releases/download/binaries-abi-v{abi}/index.toml" diff --git a/packages/registry/php/intl-icu-data-loader.c b/packages/registry/php/intl-icu-data-loader.c new file mode 100644 index 000000000..77f5e60a5 --- /dev/null +++ b/packages/registry/php/intl-icu-data-loader.c @@ -0,0 +1,109 @@ +/* + * Feeds ICU its common data at intl.so load time. + * + * ICU ships as the standalone file icu.dat (see packages/registry/icu), but + * ICU's automatic loader only looks for the conventional icudt.dat + * name and would never find icu.dat on its own. 
So instead of embedding the + * ~30 MB blob in the .so, we hand it to ICU via udata_setCommonData() from a + * constructor: the side-module loader runs __wasm_call_ctors before PHP calls + * intl's MINIT, so the data is in place before any ICU service touches it. + * + * A missing/unreadable icu.dat is non-fatal at load (intl.so may be present + * without any code using intl) but stays loud: we warn to stderr and let ICU + * fail with U_MISSING_RESOURCE_ERROR when a service actually needs data, rather + * than silently succeeding. Path defaults to /usr/lib/php/icu.dat, overridable + * via KANDELO_ICU_DAT_PATH for VFS images that stage it elsewhere. + */ + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <unicode/uclean.h> +#include <unicode/udata.h> +#include <unicode/utypes.h> + +#define KANDELO_ICU_DAT_DEFAULT "/usr/lib/php/icu.dat" + +static void kandelo_intl_load_icu_data(void) __attribute__((constructor)); + +static void kandelo_intl_load_icu_data(void) { + const char *path = getenv("KANDELO_ICU_DAT_PATH"); + if (path == NULL || path[0] == '\0') { + path = KANDELO_ICU_DAT_DEFAULT; + } + + int fd = open(path, O_RDONLY); + if (fd < 0) { + fprintf(stderr, + "[intl] ICU data not loaded: cannot open %s. " + "intl functions will fail with U_MISSING_RESOURCE_ERROR. " + "Set KANDELO_ICU_DAT_PATH or stage icu.dat there.\n", + path); + return; + } + + struct stat st; + if (fstat(fd, &st) != 0 || st.st_size <= 0) { + fprintf(stderr, "[intl] ICU data not loaded: cannot stat %s.\n", path); + close(fd); + return; + } + + /* + * ICU keeps this pointer for the life of the process, so the buffer must + * outlive this function and is deliberately never freed. A plain read (not + * mmap) sidesteps the VFS's emulated mmap and runs once per process.
+ */ + size_t size = (size_t) st.st_size; + void *buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "[intl] ICU data not loaded: OOM reading %s (%zu bytes).\n", + path, size); + close(fd); + return; + } + + size_t off = 0; + while (off < size) { + ssize_t n = read(fd, (char *) buf + off, size - off); + if (n < 0) { + fprintf(stderr, "[intl] ICU data not loaded: read error on %s.\n", path); + free(buf); + close(fd); + return; + } + if (n == 0) break; + off += (size_t) n; + } + close(fd); + + if (off != size) { + fprintf(stderr, "[intl] ICU data not loaded: short read on %s (%zu/%zu).\n", + path, off, size); + free(buf); + return; + } + + UErrorCode status = U_ZERO_ERROR; + udata_setCommonData(buf, &status); + if (U_FAILURE(status)) { + fprintf(stderr, + "[intl] udata_setCommonData(%s) failed: %s. " + "(Likely an ICU library/data version mismatch.)\n", + path, u_errorName(status)); + free(buf); + return; + } + + /* Force ICU to validate/initialize now so version skew surfaces at load. */ + status = U_ZERO_ERROR; + u_init(&status); + if (U_FAILURE(status)) { + fprintf(stderr, "[intl] u_init after loading %s failed: %s.\n", + path, u_errorName(status)); + } +} diff --git a/packages/registry/php/package.toml b/packages/registry/php/package.toml index 709e8699c..0051117cf 100644 --- a/packages/registry/php/package.toml +++ b/packages/registry/php/package.toml @@ -2,7 +2,7 @@ kind = "program" name = "php" version = "8.3.2" kernel_abi = 7 -depends_on = ["zlib@1.3.1", "openssl@3.3.2", "sqlite@3.49.1", "libxml2@2.13.8"] +depends_on = ["zlib@1.3.1", "openssl@3.3.2", "sqlite@3.49.1", "libxml2@2.13.8", "icu@74.2", "libcxx@21.1.7"] # wasm32 only. 
PHP previously declared wasm64 too (per # memory/wasm64-build-policy.md), but no demo actually consumes # `programs/wasm64/php/...` — only mariadb needs the 4 GB address @@ -48,3 +48,16 @@ wasm = "php-fpm.wasm" [[outputs]] name = "opcache" wasm = "opcache.so" + +# intl shipped as a fourth output: a regular PHP extension (.so side module) +# that PHP loads via `extension=intl.so` at MINIT — runtime-optional, so base +# php.wasm carries no ICU. It statically absorbs ICU (icu@74.2) and +# libc++/libc++abi; the ~30 MB ICU common data is NOT embedded but ships +# separately as icu.dat (from the icu package's share/icu.dat), loaded at +# runtime via udata_setCommonData() by intl-icu-data-loader.c. build-php.sh +# links it with `wasm32posix-cc -shared` from ext/intl/**/.libs/*.o for the +# same reason opcache is hand-linked (bundled libtool won't emit a shared +# module on this cross target). +[[outputs]] +name = "intl" +wasm = "intl.so" diff --git a/packages/registry/php/test/php-intl.test.ts b/packages/registry/php/test/php-intl.test.ts new file mode 100644 index 000000000..7fbb65598 --- /dev/null +++ b/packages/registry/php/test/php-intl.test.ts @@ -0,0 +1,95 @@ +import { describe, it, expect } from "vitest"; +import { existsSync, readdirSync, statSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import { homedir } from "node:os"; +import { runCentralizedProgram } from "../../../../host/test/centralized-test-helper"; +import { tryResolveBinary } from "../../../../host/src/binary-resolver"; +import { NodePlatformIO } from "../../../../host/src/platform/node"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +// intl is a RUNTIME-OPTIONAL side module: base php.wasm is built with +// --enable-intl=shared, so intl is NOT compiled in. 
intl.so is loaded on +// demand via `extension=intl.so`, and pulls its ICU common data from the +// separate icu.dat at runtime (udata_setCommonData in intl-icu-data-loader.c). +const phpBinaryPath = + tryResolveBinary("programs/php/php.wasm") ?? + join(__dirname, "../php-src/sapi/cli/php"); +const intlSoPath = tryResolveBinary("programs/php/intl.so"); + +// icu.dat lives in the icu package's resolver cache dir. Pick the newest +// non-temp build for the current arch. +function findIcuDat(): string | undefined { + const libsDir = join(homedir(), ".cache/kandelo/libs"); + if (!existsSync(libsDir)) return undefined; + const candidates = readdirSync(libsDir) + .filter((n) => n.startsWith("icu-") && n.includes("-wasm32-") && !n.includes(".tmp-")) + .map((n) => join(libsDir, n, "share", "icu.dat")) + .filter((p) => existsSync(p)); + if (candidates.length === 0) return undefined; + return candidates.sort((a, b) => statSync(b).mtimeMs - statSync(a).mtimeMs)[0]; +} +const icuDatPath = findIcuDat(); + +const READY = existsSync(phpBinaryPath) && intlSoPath != null && icuDatPath != null; + +describe.skipIf(!READY)("PHP intl as a runtime-loadable side module", () => { + // Proves the base binary is genuinely ICU-free / intl-free: intl only + // appears when explicitly loaded. This is the whole point of the design. + it("base php.wasm does NOT include intl", async () => { + const { stdout, exitCode } = await runCentralizedProgram({ + programPath: phpBinaryPath, + argv: ["php", "-m"], + // Same host-I/O adapter as the other cases: `php -m` needs no files, + // but it keeps the harness off the "default" rootfs.vfs image (not a + // fixture this package ships) so the run stays self-contained. 
+ io: new NodePlatformIO(), + }); + expect(exitCode).toBe(0); + expect(stdout.toLowerCase()).not.toContain("intl"); + }, 60_000); + + it("loads intl.so at runtime via extension=", async () => { + const { stdout, exitCode } = await runCentralizedProgram({ + programPath: phpBinaryPath, + argv: ["php", "-d", `extension=${intlSoPath}`, "-r", + 'echo extension_loaded("intl") ? "intl-loaded" : "intl-missing";'], + env: [`KANDELO_ICU_DAT_PATH=${icuDatPath}`], + io: new NodePlatformIO(), + }); + expect(stdout).toContain("intl-loaded"); + expect(exitCode).toBe(0); + }, 60_000); + + // Exercises real ICU data (locale display names) to prove icu.dat is + // actually loaded and usable, not just that the module registered. + it("intl uses ICU data (Locale::getDisplayLanguage)", async () => { + const { stdout, exitCode } = await runCentralizedProgram({ + programPath: phpBinaryPath, + argv: ["php", "-d", `extension=${intlSoPath}`, "-r", + 'echo Locale::getDisplayLanguage("fr", "en");'], + env: [`KANDELO_ICU_DAT_PATH=${icuDatPath}`], + io: new NodePlatformIO(), + }); + expect(stdout).toContain("French"); + expect(exitCode).toBe(0); + }, 60_000); + + // Collator sorting is a core ICU service that requires collation data. + it("intl Collator sorts with locale rules", async () => { + const { stdout, exitCode } = await runCentralizedProgram({ + programPath: phpBinaryPath, + argv: ["php", "-d", `extension=${intlSoPath}`, "-r", ` + $c = new Collator("en_US"); + $a = ["banana", "apple", "cherry"]; + $c->sort($a); + echo implode(",", $a); + `], + env: [`KANDELO_ICU_DAT_PATH=${icuDatPath}`], + io: new NodePlatformIO(), + }); + expect(stdout).toContain("apple,banana,cherry"); + expect(exitCode).toBe(0); + }, 60_000); +});