diff --git a/.github/workflows/build-from-source.yml b/.github/workflows/build-from-source.yml new file mode 100644 index 0000000..7a207d8 --- /dev/null +++ b/.github/workflows/build-from-source.yml @@ -0,0 +1,333 @@ +# UNIVERSAL build-from-source workflow — BYTE-IDENTICAL across every Model B fork. +# +# DO NOT customize this file per-fork. If a fork needs different behavior, use +# GitHub Actions repository Variables (Settings → Secrets and variables → +# Actions → Variables): +# +# DOCKERFILE override auto-detected path (default: root Dockerfile, then shallowest) +# BUILD_CONTEXT override docker build context (default: . — repo root) +# PLATFORMS override platform list (default: linux/amd64,linux/arm64) +# BUILD_ARGS multi-line KEY=VALUE pairs passed to docker build (default: none). +# Used by forks whose upstream Dockerfile declares an ARG with no +# default that the upstream's own CI passes externally (e.g. +# tika-docker's TIKA_VERSION via republish-images.sh). +# IS_SOURCE_BUILT job-level gate: build runs only when set to 'true'. Plain-mirror +# forks set 'false' so the workflow file stays installed (uniform +# scaffolding) but the job no-ops; fork-publish.yml owns :latest. +# FREE_DISK_SPACE opt-in (set 'true') for forks whose images blow past the +# 14GB default disk on ubuntu-latest (vllm-class CUDA images). +# Adds ~3 min per build; reclaims ~30GB by removing pre-installed +# Android SDK / .NET / Haskell / large npm caches. +# RUNS_ON per-fork runner-label override (default: "ubuntu-latest"). +# JSON-encoded — single string for one label, or JSON array for +# multiple. Set to '["self-hosted","lan-docker","big-build"]' +# for forks that exceed hosted-runner capacity (vllm). +# BUILD_TIMEOUT_MINUTES per-fork job-timeout override (default 360, the +# GitHub-hosted-runner ceiling). Raise as a ONE-SHOT bootstrap +# on forks whose first cold-cache build exceeds 360min (vllm +# single-arch CUDA: ~5-6h first run, ~30-45m once :buildcache +# is populated). Revert to default after first green run. +# Honest-fact #97. +# PRECOMPILED_WHEEL_BASE_URL master switch for the upstream-prebuilt-wheel +# build path. Set to e.g. `https://wheels.vllm.ai` on forks +# whose upstream publishes per-commit prebuilt wheels AND +# whose carry stack is 100% non-compiled (Python/MD only). +# When set, the build passes ${PREFIX}_USE_PRECOMPILED=1 + the +# resolved commit + variant as build-args, and the fork's +# Dockerfile carry can early-out the local C++/CUDA compile. +# When unset (default for all forks), build path is unchanged. +# sync-upstream.yml writes PRECOMPILED_WHEEL_COMMIT after +# curl-verifying the wheel exists; if no variant in +# PRECOMPILED_WHEEL_VARIANT_CHAIN publishes, sync CLEARS +# PRECOMPILED_WHEEL_COMMIT to empty so the build falls back +# to source compile rather than serving stale-ABI binaries +# (the only safe semantics — a wheel built against SHA-N-1 +# with Python carries against SHA-N's APIs segfaults at +# runtime with no build-time signal). Honest-fact #99. +# PRECOMPILED_WHEEL_COMMIT SHA of the upstream commit whose wheel to fetch. +# Managed by sync-upstream — operator should NOT set this +# directly UNLESS PRECOMPILED_WHEEL_COMMIT_FROZEN=true (then +# sync respects the manual pin). Empty when no wheel is +# currently available; build falls back to source compile. +# PRECOMPILED_WHEEL_VARIANT wheel build variant (e.g. `cu130`, `cu128`). +# Operator-set initial value; sync-upstream updates in-place +# when the variant chain falls through to a different one +# (unless PRECOMPILED_WHEEL_COMMIT_FROZEN=true). +# PRECOMPILED_WHEEL_VARIANT_CHAIN optional. Space-separated fallback list +# (e.g. `cu130 cu128`). Sync-upstream tries each in order; +# first one whose wheel exists wins. Defaults to the single +# PRECOMPILED_WHEEL_VARIANT value. +# PRECOMPILED_WHEEL_BUILD_ARG_PREFIX prefix for the synthesized build-arg +# env names. Default `VLLM` — synthesizes VLLM_USE_PRECOMPILED, +# VLLM_PRECOMPILED_WHEEL_COMMIT, etc. For forks whose upstream +# uses a different convention (MinerU, torchao, future), set +# to that project's prefix (e.g. `MINERU`) → MINERU_USE_PRECOMPILED. +# Keeps the template generic across vendor opt-ins. +# PRECOMPILED_WHEEL_COMMIT_FROZEN set to `true` to pin +# PRECOMPILED_WHEEL_COMMIT/_VARIANT manually — sync-upstream +# will NOT overwrite them when set. Use for testing a specific +# wheel SHA, or for rolling back from a bad upstream commit. +# Default unset (sync auto-manages). +# +# If a fork's Dockerfile needs out-of-context pre-build steps, the right fix is +# to cherry-pick a multi-stage self-contained Dockerfile from upstream into our +# intarweb-dev — NOT to add pre-build logic here. +# +# Codified in oss-contributing:ghcr-fork-mirror skill honest-fact #52. + +name: Build from source → GHCR + +on: + push: + branches: [intarweb-dev] + workflow_dispatch: + +# Single-flight per repo: schedule-driven sync push + Heal K dispatch + manual +# workflow_dispatch can all converge. cancel-in-progress: false because builds +# are expensive (vllm 30-45 min, bifrost 45 min) — let the in-flight one finish, +# queue the next behind it. Honest-fact #74. +concurrency: + group: build-from-source-${{ github.repository }} + cancel-in-progress: false + +permissions: + contents: read + packages: write + +jobs: + build: + # IS_SOURCE_BUILT Variable is the authoritative source-vs-mirror switch + # for the fleet. Setting it to anything other than 'true' (or leaving it + # unset) skips the build entirely. This keeps the WORKFLOW FILE itself + # byte-identical across every fork (uniform infra scaffolding) while + # giving operators a per-fork Variable to control source-build vs + # plain-mirror behavior without removing files or disabling workflows. + # Plain-mirror forks (docker-neo4j etc) still have build-from-source.yml + # installed but the job no-ops; fork-publish.yml owns :latest for them. + # + # Branch gate: when triggered by a push, only run on intarweb-dev. The + # `on.push.branches: [intarweb-dev]` filter above is supposed to handle + # this, but in practice GitHub still records 0-second false-failure runs + # against build-from-source.yml when sync-upstream's bot pushes to main + # (empty jobs[], shows red in Actions UI, head_branch=main, name=workflow + # path rather than display name). Belt-and-suspenders branch gate here + # ensures the job no-ops cleanly to a green "skipped" rather than a + # mysterious red failure if the trigger ever evaluates against main / + # master / any other ref. Manual workflow_dispatch and workflow_call + # paths remain unaffected. Honest-fact #109. + if: >- + vars.IS_SOURCE_BUILT == 'true' + && ( + github.event_name == 'workflow_dispatch' + || github.event_name == 'workflow_call' + || (github.event_name == 'push' && github.ref == 'refs/heads/intarweb-dev') + ) + # Per-fork override via repo Variable RUNS_ON (default: ubuntu-latest hosted). + # Set to '["self-hosted","lan-docker","big-build"]' (JSON array, single line) + # for forks whose builds exceed hosted-runner capacity (16GB RAM / 14GB disk + # — vllm's CUDA csrc-build is the canonical case). Honest-fact #89. + runs-on: ${{ fromJSON(vars.RUNS_ON || '"ubuntu-latest"') }} + # Per-fork override via repo Variable BUILD_TIMEOUT_MINUTES (default 360, the + # GitHub-hosted-runner hard ceiling). Raise to 720 as a ONE-SHOT bootstrap on + # forks whose first build needs to populate :buildcache from scratch (vllm + # at single-arch CUDA: ~5-6h cold-cache vs ~30-45m warm). Revert to 360 once + # warm. Codified after 27340784996 / 27373307060 cancelled at 360min on the + # vllm fork's first cold-cache run. Honest-fact #91. + timeout-minutes: ${{ fromJSON(vars.BUILD_TIMEOUT_MINUTES || '360') }} + # Read-once capture of precompiled-wheel Variables at job start. Subsequent + # steps reference $PRECOMPILED_* env vars (resolved once at job-init) rather + # than ${{ vars.* }} (re-resolved per-step expression). This makes an + # in-flight build immune to mid-job Variable mutations from a concurrent + # sync-upstream writing PRECOMPILED_WHEEL_COMMIT/_VARIANT. Eliminates the + # read/write race architect-review #1 flagged. Honest-fact #99. + env: + PRECOMPILED_BASE_URL: ${{ vars.PRECOMPILED_WHEEL_BASE_URL }} + PRECOMPILED_COMMIT: ${{ vars.PRECOMPILED_WHEEL_COMMIT }} + PRECOMPILED_VARIANT: ${{ vars.PRECOMPILED_WHEEL_VARIANT }} + PRECOMPILED_PREFIX: ${{ vars.PRECOMPILED_WHEEL_BUILD_ARG_PREFIX || 'VLLM' }} + OPERATOR_BUILD_ARGS: ${{ vars.BUILD_ARGS }} + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + + # Optional disk reclamation for forks whose images blow past the 14GB + # default disk on ubuntu-latest hosted runners (vllm being the canonical + # offender — its CUDA build artifacts + intermediate layers exceed 20GB + # easily). Frees ~30GB by removing pre-installed Android SDK, .NET, + # Haskell, large npm caches, and codeql databases. Costs ~3 min — gated + # behind FREE_DISK_SPACE Variable so non-vllm-class forks don't pay it. + - name: 🧹 Free disk space (opt-in via FREE_DISK_SPACE Variable) + if: vars.FREE_DISK_SPACE == 'true' + uses: jlumbroso/free-disk-space@v1.3.1 + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: false + swap-storage: false + + - name: 🔍 Resolve Dockerfile + id: df + env: + DOCKERFILE_VAR: ${{ vars.DOCKERFILE }} + run: | + set -euo pipefail + # 1. Explicit override (repo Variable) always wins. + if [ -n "${DOCKERFILE_VAR}" ]; then + echo " ✓ using DOCKERFILE override: ${DOCKERFILE_VAR}" + echo "path=${DOCKERFILE_VAR}" >> "$GITHUB_OUTPUT" + echo "skip=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + # 2. Prefer a root Dockerfile. + if [ -f Dockerfile ]; then + echo " ✓ found root Dockerfile" + echo "path=Dockerfile" >> "$GITHUB_OUTPUT" + echo "skip=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + # 3. Deterministic discovery: shallowest non-test Dockerfile. + cand="$(find . -maxdepth 4 -type f -name Dockerfile \ + -not -path './node_modules/*' \ + -not -path './.git/*' \ + -not -path '*/test/*' \ + -not -path '*/tests/*' \ + -not -path '*/example*/*' \ + -not -path './vendor/*' \ + -not -path './third_party/*' \ + | sed 's|^\./||' \ + | awk '{print gsub(/\//,"/"), $0}' \ + | sort -n | head -n1 | cut -d' ' -f2- || true)" + if [ -z "$cand" ]; then + echo " ✗ no Dockerfile found → plain-mirror fallback (no image built)" + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo " ✓ auto-found: $cand" + echo "path=$cand" >> "$GITHUB_OUTPUT" + echo "skip=false" >> "$GITHUB_OUTPUT" + + - name: Set up QEMU + if: steps.df.outputs.skip != 'true' + uses: docker/setup-qemu-action@v3 + + - name: Set up Buildx + if: steps.df.outputs.skip != 'true' + uses: docker/setup-buildx-action@v3 + + - name: Log in to ghcr.io + if: steps.df.outputs.skip != 'true' + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 🔡 Compute image (lowercase — GHCR requires lowercase repo names) + id: imgname + run: | + # vars.IMAGE_NAME may already be lowercase (set as such manually), but + # the github.event.repository.name fallback can be CamelCase if the fork + # repo name has uppercase letters (e.g. RetroSaveManager). docker/build-push + # rejects ANY uppercase in the image path. fork-publish.yml does the same + # via tr; we do it here. Honest-fact #63. + NAME="${{ vars.IMAGE_NAME || github.event.repository.name }}" + NAME_LOWER=$(echo "$NAME" | tr '[:upper:]' '[:lower:]') + echo "image=ghcr.io/${{ github.repository_owner }}/${NAME_LOWER}" >> "$GITHUB_OUTPUT" + + - name: 🏷️ Docker metadata + id: meta + if: steps.df.outputs.skip != 'true' + uses: docker/metadata-action@v5 + with: + # Per-fork override via repo Variable IMAGE_NAME — for forks where the + # published image name differs from the repo name (docker-autoheal repo → + # autoheal image). Falls back to repo name. Same as fork-publish.yml so + # ONE image per fork. Lowercased above (metadata-action's lowercase only + # applies to tags, not to the images input — uppercase repo names slip + # through and break build-push with "repository name must be lowercase"). + images: ${{ steps.imgname.outputs.image }} + tags: | + type=raw,value=latest,enable=${{ github.ref == 'refs/heads/intarweb-dev' }} + type=sha,prefix=sha-,format=short + + - name: 🧩 Synthesize build-args (BUILD_ARGS + optional precompiled-wheel envs) + # Combines operator-set BUILD_ARGS with auto-derived precompiled-wheel + # build-args when PRECOMPILED_WHEEL_BASE_URL is set on the fork. For + # forks WITHOUT that Variable (the default), output is byte-identical to + # the pre-existing `build-args: ${{ vars.BUILD_ARGS || '' }}` behavior. + # Uses $PRECOMPILED_* env vars captured at job start (job-level env + # block) — immune to mid-build Variable mutations. Synthesizes envs with + # the configured PREFIX (default `VLLM`), so non-vllm forks adopting this + # pattern (MinerU, torchao, future) just set + # PRECOMPILED_WHEEL_BUILD_ARG_PREFIX without template surgery. + # Honest-fact #99/#104. + id: bargs + if: steps.df.outputs.skip != 'true' + run: | + set -euo pipefail + BARGS="${OPERATOR_BUILD_ARGS:-}" + if [ -n "${PRECOMPILED_BASE_URL:-}" ] && [ -n "${PRECOMPILED_COMMIT:-}" ]; then + # Build the precompiled-wheel envs via printf so the multi-line + # KEY=VALUE block doesn't dedent the surrounding YAML run block + # (honest-fact #104 — broken-yaml-from-bash-multiline-assignment + # 0-second-failed all 22 forks 2026-06-12). + # + # Emit BOTH ${PREFIX}_PRECOMPILED_WHEEL_COMMIT and + # ${PREFIX}_MERGE_BASE_COMMIT to the same SHA — different + # Dockerfile carries may key off either name (vllm-bot's carry + # uses MERGE_BASE_COMMIT; theoretical future carries may use + # the more-explicit PRECOMPILED_WHEEL_COMMIT). Both work. + # Honest-fact #105. + PREFIX="${PRECOMPILED_PREFIX:-VLLM}" + EXTRA=$(printf '%s_USE_PRECOMPILED=1\n%s_PRECOMPILED_WHEEL_COMMIT=%s\n%s_MERGE_BASE_COMMIT=%s\n%s_PRECOMPILED_WHEEL_VARIANT=%s\n%s_PRECOMPILED_WHEEL_BASE_URL=%s' "$PREFIX" "$PREFIX" "$PRECOMPILED_COMMIT" "$PREFIX" "$PRECOMPILED_COMMIT" "$PREFIX" "$PRECOMPILED_VARIANT" "$PREFIX" "$PRECOMPILED_BASE_URL") + if [ -n "$BARGS" ]; then + BARGS="${BARGS}"$'\n'"${EXTRA}" + else + BARGS="${EXTRA}" + fi + echo " ✓ precompiled-wheel build-args appended (prefix=${PREFIX} commit=${PRECOMPILED_COMMIT} variant=${PRECOMPILED_VARIANT})" + elif [ -n "${PRECOMPILED_BASE_URL:-}" ] && [ -z "${PRECOMPILED_COMMIT:-}" ]; then + # Master switch on, but no resolved commit. Sync-upstream cleared it + # because no wheel published for the current upstream SHA. Build will + # fall back to source compile — degraded perf, but correct binaries + # (vs the silent-ABI-break risk of serving a stale wheel). + echo "::warning::PRECOMPILED_WHEEL_BASE_URL set but PRECOMPILED_WHEEL_COMMIT empty — sync-upstream couldn't find a wheel for current upstream SHA. Falling back to source compile." + fi + { + echo "build_args<<__EOB__" + echo "$BARGS" + echo "__EOB__" + } >> "$GITHUB_OUTPUT" + + - name: 🚀 Build & push + if: steps.df.outputs.skip != 'true' + uses: docker/build-push-action@v6 + with: + context: ${{ vars.BUILD_CONTEXT || '.' }} + file: ${{ steps.df.outputs.path }} + platforms: ${{ vars.PLATFORMS || 'linux/amd64,linux/arm64' }} + # build-args synthesized above. Combines operator-set BUILD_ARGS with + # auto-derived precompiled-wheel envs when PRECOMPILED_WHEEL_BASE_URL + # is set. For forks without it, equivalent to BUILD_ARGS alone. + # Honest-facts #88, #99. + build-args: ${{ steps.bargs.outputs.build_args }} + # BuildKit registry-cache. Reuses prior build's intermediate layers + # across runs to dramatically cut wall time on warm builds (vllm: + # ~30min → ~10-12min target). cache-from is read-only — first run on + # a fresh fork misses everything (cold cache, no penalty). cache-to + # `mode=max` exports ALL intermediate stages (not just the final + # image), so the next run can resume mid-stage. `compression=zstd` + # for faster decompression on cache pulls. + # ignore-error=true on cache-to: if the push to :buildcache fails + # (perms, network), the build still succeeds; just warm cache misses + # on next run. Honest-fact #107. + cache-from: type=registry,ref=${{ steps.imgname.outputs.image }}:buildcache + cache-to: type=registry,ref=${{ steps.imgname.outputs.image }}:buildcache,mode=max,compression=zstd,ignore-error=true + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + provenance: false diff --git a/.github/workflows/fold-on-push.yml b/.github/workflows/fold-on-push.yml new file mode 100644 index 0000000..08a4bbe --- /dev/null +++ b/.github/workflows/fold-on-push.yml @@ -0,0 +1,64 @@ +# UNIVERSAL fold-on-push trigger — byte-identical across every Model B fork. +# +# Purpose: when a push lands on ANY branch other than the default branch or +# intarweb-dev, dispatch sync-upstream.yml to re-fold + re-publish. This is +# the event-driven counterpart to the hourly schedule — covers "new commit +# landed on a PR head branch" and "operator pushed a new branch they're +# about to PR" instantly, without waiting up to 1 hour. +# +# Why every-branch-except-defaults: PR head branches use varied naming +# (feat/*, fix/*, test/*, docs/*, etc). Enumerating patterns means missing +# new ones (build-overlay.yml's pattern misses feat/* and docs/*, MEASURED +# 2026-06-10). Wildcard with explicit exclusions for self-trigger paths is +# the simplest correct shape. +# +# Loop prevention: branches-ignore covers EVERY ref this workflow's downstream +# (sync-upstream.yml) pushes — main, master, intarweb-dev. Pushes from the +# hard-reset/ops-overlay/intarweb-dev-regen logic therefore do NOT re-fire +# this workflow. +# +# Cadence bypass: the dispatch into sync-upstream.yml fires it as a +# workflow_dispatch event. sync-upstream's cadence gate explicitly bypasses +# for non-schedule events (see its ⏱️ Cadence gate step). So a push triggers +# an immediate fold regardless of vars.SYNC_CADENCE_HOURS. +# +# Codified in oss-contributing:ghcr-fork-mirror skill honest-fact #64. + +name: Fold on PR head-branch push + +on: + push: + branches-ignore: + - main + - master + - intarweb-dev + +permissions: + actions: write # required for `gh workflow run` dispatch into sync-upstream + +jobs: + dispatch-sync: + runs-on: ubuntu-latest + steps: + - name: 🤖 Mint org-bot app token + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ vars.SYNC_APP_ID }} + private-key: ${{ secrets.SYNC_APP_PRIVATE_KEY }} + + - name: 🌿 Dispatch sync-upstream (full re-fold) + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + echo " push detected on branch: ${{ github.ref_name }}" + echo " triggering full re-fold via sync-upstream.yml" + # The workflow_dispatch event is whitelisted by sync-upstream's + # cadence gate — runs immediately, bypassing SYNC_CADENCE_HOURS. + # --ref points at the workflow file location (default branch), + # NOT the branch that was pushed. sync-upstream itself always + # operates on default branch + folds open PRs regardless of + # which branch triggered this dispatch. + gh workflow run "Sync from upstream + auto-regen intarweb-dev" \ + --repo "${{ github.repository }}" \ + --ref "${{ github.event.repository.default_branch }}" \ No newline at end of file diff --git a/.github/workflows/fork-publish.yml b/.github/workflows/fork-publish.yml new file mode 100644 index 0000000..006a48c --- /dev/null +++ b/.github/workflows/fork-publish.yml @@ -0,0 +1,115 @@ +# UNIVERSAL fork-publish workflow — BYTE-IDENTICAL across every fork. +# +# Mirrors upstream's released versioned tags to ghcr.io/${{ github.repository }} +# (auto-lowercased by docker/metadata-action). Same image namespace as +# build-from-source.yml — ONE concept, no per-fork vanity names. +# +# :latest is owned by build-from-source.yml (built from intarweb-dev). +# This workflow handles VERSIONED tags only (1.2.3, 1.2, 1, etc). +# +# Codified in oss-contributing:ghcr-fork-mirror skill honest-fact #57. + +name: Fork-publish (mirror upstream → GHCR) + +on: + schedule: + - cron: '17 6 * * *' # daily 06:17 UTC — well before sync-upstream's :00 cron + workflow_dispatch: + +permissions: + contents: read + packages: write + +env: + # Per-fork override via repo Variable IMAGE_NAME — used when upstream's published + # image name differs from our repo name (e.g. docker-autoheal repo → autoheal image). + # Falls back to repo name. ONE image per fork, shared with build-from-source.yml. + IMAGE: ghcr.io/${{ github.repository_owner }}/${{ vars.IMAGE_NAME || github.event.repository.name }} + +jobs: + mirror: + runs-on: ubuntu-latest + steps: + - uses: docker/setup-buildx-action@v3 + + - name: 🔑 Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 🔍 Discover upstream tags + id: tags + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + # GitHub repo (for `gh release list` to discover versioned tags) + UPSTREAM_REPO="joeblack2k/SGM-Helper" + # Docker image source (where imagetools mirrors FROM). Defaults to + # GitHub repo path; override via repo Variable UPSTREAM_IMAGE for forks + # where Docker Hub path differs (e.g. acmesh-official/acme.sh on GitHub + # but neilpang/acme.sh on Docker Hub). + UPSTREAM_IMAGE="${{ vars.UPSTREAM_IMAGE || format('{0}/{1}', 'joeblack2k', 'SGM-Helper') }}" + RELEASES=$(gh release list --repo "$UPSTREAM_REPO" --limit 3 --json tagName \ + --jq '.[].tagName | sub("^v"; "")' 2>/dev/null | tr '\n' ' ' || echo "") + # Fallback: if upstream uses raw git tags without GH Release objects + # (e.g. neo4j/docker-neo4j), gh release list returns empty. Use the + # git tags API and filter for version-shaped tags (must contain at least + # one N.N component; skips purely descriptive words like + # community/latest/enterprise/nightly/dev/main). Honest-fact #86. + if [ -z "$(echo "$RELEASES" | tr -d ' ')" ]; then + RELEASES=$(gh api "repos/$UPSTREAM_REPO/tags?per_page=10" \ + --jq '.[].name' 2>/dev/null \ + | grep -E '[0-9]+\.[0-9]+' \ + | sed 's/^v//' \ + | head -3 \ + | tr '\n' ' ' || echo "") + [ -n "$(echo "$RELEASES" | tr -d ' ')" ] && \ + echo " (release-list was empty; fell back to git tags API)" + fi + MAJORS=$(for r in $RELEASES; do echo "${r%%.*}"; done | sort -u | tr '\n' ' ') + TAGS="$RELEASES $MAJORS" + echo "upstream_image=$UPSTREAM_IMAGE" >> "$GITHUB_OUTPUT" + echo "tags=$TAGS" >> "$GITHUB_OUTPUT" + echo " will mirror: $TAGS" + + - name: 🪞 Mirror tags to GHCR + run: | + set -euo pipefail + UPSTREAM="${{ steps.tags.outputs.upstream_image }}" + # Lowercase the image (Docker requires lowercase repo names; metadata-action + # does this automatically for build-push, we do it manually here for imagetools). + TARGET=$(echo "${IMAGE}" | tr '[:upper:]' '[:lower:]') + NEWEST_VERSIONED="" + for tag in ${{ steps.tags.outputs.tags }}; do + [ -z "$tag" ] && continue + echo "::group::Mirror $UPSTREAM:$tag → $TARGET:$tag" + if docker buildx imagetools inspect "$UPSTREAM:$tag" >/dev/null 2>&1; then + docker buildx imagetools create --tag "$TARGET:$tag" "$UPSTREAM:$tag" + echo " ✓ mirrored $tag" + # Track the first (newest) versioned tag for :latest aliasing below + if [ -z "$NEWEST_VERSIONED" ]; then NEWEST_VERSIONED="$tag"; fi + else + echo " - upstream tag $tag does not exist, skipping" + fi + echo "::endgroup::" + done + + # Also publish :latest pointing to the newest successfully-mirrored version, + # ONLY if this fork is plain-mirror (i.e. has no source-build). Source-built + # forks (vars.IS_SOURCE_BUILT=true) own :latest via build-from-source.yml, + # which builds from intarweb-dev (= upstream + open PRs cherry-picked + ops + # overlay). Letting fork-publish stomp :latest on those forks would clobber + # our patch stack with upstream-pristine, breaking the contract. + # Plain-mirror forks (var unset) keep the historical behavior — fork-publish + # IS the :latest source. Honest-fact #72. + if [ -n "$NEWEST_VERSIONED" ] && [ "${{ vars.IS_SOURCE_BUILT }}" != "true" ]; then + echo "::group::Alias :latest → :$NEWEST_VERSIONED" + docker buildx imagetools create --tag "$TARGET:latest" "$UPSTREAM:$NEWEST_VERSIONED" + echo " ✓ :latest → :$NEWEST_VERSIONED" + echo "::endgroup::" + elif [ "${{ vars.IS_SOURCE_BUILT }}" = "true" ]; then + echo " - skipping :latest alias (IS_SOURCE_BUILT=true; build-from-source.yml owns :latest)" + fi \ No newline at end of file diff --git a/.github/workflows/sync-upstream.yml b/.github/workflows/sync-upstream.yml new file mode 100644 index 0000000..084f327 --- /dev/null +++ b/.github/workflows/sync-upstream.yml @@ -0,0 +1,733 @@ +name: Sync from upstream + auto-regen intarweb-dev + +on: + schedule: + - cron: '0 * * * *' # hourly wakeup. ACTUAL sync work is gated by a runtime cadence + # check (vars.SYNC_CADENCE_HOURS, default=1). GitHub does NOT + # allow expression substitution in on.schedule.cron — confirmed + # via docs + empirical: 22 fleet forks all use literal crons. + # So cadence is a GATE inside the job, not a schedule swap. + # Dead upstreams set SYNC_CADENCE_HOURS=168 (weekly) etc. + workflow_dispatch: + workflow_call: # invoked by fold-on-push.yml when a PR head branch is pushed + # (push event on intarweb-side feat/*, fix/*, test/*, docs/* etc). + # workflow_call bypasses the cadence gate — pushes always rebuild. + +permissions: + actions: write # REQUIRED — the workflow_dispatch publish-trigger step 403s without it (honest-fact #27) + contents: write + pull-requests: read + +# One sync at a time per fork. Without this, a workflow_dispatch and a schedule +# tick can race — both fold cleanly, both attempt force-with-lease push, one +# wins, the loser fails the push with "failed to push some refs" and emits a +# false-failure alert. Concurrency group serializes them; cancel-in-progress +# false keeps the earlier run going (don't waste the cherry-pick work). +# Burned 2026-06-10 during fleet rollout on vllm-jukebox. +concurrency: + group: sync-upstream-${{ github.repository }} + cancel-in-progress: false + +jobs: + sync: + runs-on: ubuntu-latest + steps: + - name: 🤖 Mint org-bot app token + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ vars.SYNC_APP_ID }} + private-key: ${{ secrets.SYNC_APP_PRIVATE_KEY }} + + - name: 📥 Checkout fork + uses: actions/checkout@v4 + with: + fetch-depth: 0 + # Org-wide GitHub App token (1h short-lived, auto-rotated). + # Has workflows:write so it CAN push under .github/workflows/ — + # GITHUB_TOKEN can't (honest-fact #26). Replaces per-repo + # SYNC_WORKFLOW_TOKEN PATs campaign-wide (honest-fact #53). + token: ${{ steps.app-token.outputs.token }} + + - name: ⏱️ Cadence gate (skip schedule wakeup if too soon) + id: cadence + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + # Per-fork cadence config: vars.SYNC_CADENCE_HOURS (default 1, i.e. every hourly + # wakeup runs). Increase for dead/slow upstreams: 24=daily, 168=weekly. + # Gate only suppresses SCHEDULE events — workflow_dispatch and workflow_call + # always run regardless of cadence (those are explicit asks). + # + # last_sync is derived live from `gh run list` (no persisted state needed), + # filtered to successful runs of this workflow. The current run is in-progress, + # so it doesn't count toward "last success". + set -euo pipefail + CADENCE="${{ vars.SYNC_CADENCE_HOURS || '1' }}" + EVENT="${{ github.event_name }}" + if [ "$EVENT" != "schedule" ]; then + echo " cadence gate: event=$EVENT (non-schedule) → bypass gate, running" + echo "skip=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + LAST=$(gh run list --workflow sync-upstream.yml --status success --limit 1 \ + --json createdAt --jq '.[0].createdAt // ""') + if [ -z "$LAST" ]; then + echo " cadence gate: no prior successful sync — running" + echo "skip=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + NOW=$(date -u +%s) + LAST_TS=$(date -u -d "$LAST" +%s) + AGE_H=$(( (NOW - LAST_TS) / 3600 )) + if [ "$AGE_H" -ge "$CADENCE" ]; then + echo " cadence gate: last sync ${AGE_H}h ago ≥ cadence ${CADENCE}h → running" + echo "skip=false" >> "$GITHUB_OUTPUT" + else + echo " cadence gate: last sync ${AGE_H}h ago < cadence ${CADENCE}h → skipping wakeup" + echo "skip=true" >> "$GITHUB_OUTPUT" + fi + + - name: ⚙️ Configure git identity + if: steps.cadence.outputs.skip != 'true' + run: | + git config user.email "actions@github.com" + git config user.name "intarweb sync bot" + + - name: 🔗 Add upstream remote + if: steps.cadence.outputs.skip != 'true' + run: git remote add upstream https://github.com/joeblack2k/SGM-Helper.git + + - name: 🔄 Fetch upstream + all fork branches + if: steps.cadence.outputs.skip != 'true' + run: | + git fetch upstream main --tags + git fetch origin --prune + + - name: 🔁 Hard-reset default branch to upstream + re-apply ops overlay + if: steps.cadence.outputs.skip != 'true' + run: | + # Burned 2026-06-09 with intarweb/bifrost: `git rebase upstream/dev` + # PRESERVES commits on our default branch that aren't on upstream — + # so when upstream FORCE-PUSHED and dropped 23 commits (a half-baked + # modelcatalogresolver plugin with incomplete go.sum), those commits + # got TRAPPED on intarweb/bifrost/main forever. Build started failing + # because go.sum referenced packages upstream had since removed. + # Honest-fact #62 (and vllm-bot diagnosis in claude/forker-queue/). + # + # Fix: hard-reset to upstream. Accept upstream's history as authoritative. + # Then re-apply our 4 workflow files + FORK_INFO.md (which legitimately + # live on main so schedules can fire) from the pre-reset working tree. + # This means intarweb/main = upstream/ + exactly N legit ops + # commits — no untracked drift, no force-push survivors. + + git checkout main + + # Save ops overlay BEFORE the reset + mkdir -p /tmp/ops-overlay/.github/workflows + for F in .github/workflows/sync-upstream.yml \ + .github/workflows/build-from-source.yml \ + .github/workflows/build-overlay.yml \ + .github/workflows/fork-publish.yml \ + .github/workflows/update-fork-info.yml \ + .github/workflows/fold-on-push.yml \ + FORK_INFO.md; do + if [ -f "$F" ]; then + mkdir -p "/tmp/ops-overlay/$(dirname $F)" + cp -p "$F" "/tmp/ops-overlay/$F" + fi + done + + # Hard-reset — drops any trapped-from-force-push commits + git reset --hard upstream/main + + # Restore ops overlay + for F in .github/workflows/sync-upstream.yml \ + .github/workflows/build-from-source.yml \ + .github/workflows/build-overlay.yml \ + .github/workflows/fork-publish.yml \ + .github/workflows/update-fork-info.yml \ + .github/workflows/fold-on-push.yml \ + FORK_INFO.md; do + if [ -f "/tmp/ops-overlay/$F" ]; then + mkdir -p "$(dirname $F)" + cp -p "/tmp/ops-overlay/$F" "$F" + fi + done + + if ! git status --porcelain -- .github/workflows FORK_INFO.md | grep -q .; then + echo " no ops overlay to re-apply (none existed pre-reset)" + else + git add -A .github/workflows + [ -f FORK_INFO.md ] && git add FORK_INFO.md + git commit -m "ci: re-apply ops overlay after hard-reset to upstream/main" + fi + + git push --force-with-lease origin main + + - name: 🔍 Discover open PRs from intarweb to upstream + id: prs + if: steps.cadence.outputs.skip != 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Use GitHub's search/issues API instead of paginating /pulls?state=open. + # The old approach paginated EVERY open PR on upstream then jq-filtered + # client-side: on NousResearch/hermes-agent (MEASURED 13,295 open PRs + # 2026-06-10), that's 133 API calls per sync just to find our 1-2. + # search/issues with `author:terafin` gives the answer in 1 call. + # + # Caveats verified MEASURED on intarweb/vllm 2026-06-10 (6/6 parity vs + # paginate): + # - All intarweb-authored upstream PRs are by `terafin` today. If a + # co-contributor opens one, broaden the author qualifier OR fall + # back to a per-PR head.repo.owner.login == intarweb verification. + # - search/issues doesn't return head.ref; we fetch it per result + # (N small extra calls, where N = our PR count, typically 1-6). + # - The result set is far below search API's 1000-result cap. + # + # Defensive: after the search, re-verify each PR's head.repo.owner.login + # before adding. Drops anything that shouldn't be cherry-picked even if + # search returns extras. + # SEMANTIC CONTRACT: this filter is `author:` (PRs THAT user authored, + # from anywhere), NOT `head:` (PRs from any author whose branch lives + # in that org). GitHub's search API has no head-org qualifier; only REST + # /pulls supports head=user:branch (literal, no wildcard). These two + # filters coincide TODAY because every intarweb PR is both terafin-authored + # AND from an intarweb branch, but they diverge the first time a + # co-contributor opens a PR from an intarweb branch under a different + # account — that PR would be silently MISSED by this discovery. + # + # Defense: the per-PR re-check below enforces head.repo.owner.login == + # ${{ github.repository_owner }}, so the search can never accept a PR from + # the wrong head repo. It just can't see beyond the author qualifier. + # + # vars.SYNC_PR_AUTHOR override: defaults to `terafin` (current sole author). + # If a co-contributor joins, set this var to a space-separated list and the + # query expands. Empty value → fall back to defensive: full /pulls scan + # (slow, accurate). Honest-fact #65. + PR_AUTHORS="${{ vars.SYNC_PR_AUTHOR || 'terafin' }}" + : > /tmp/pr-nums.txt + if [ -z "$PR_AUTHORS" ]; then + # Fallback: paginate /pulls and filter client-side. Slow on big upstreams + # but provably complete. + gh api --paginate "repos/joeblack2k/SGM-Helper/pulls?state=open&per_page=100" \ + --jq '.[] | select(.head.repo.owner.login == "${{ github.repository_owner }}") | .number' \ + >> /tmp/pr-nums.txt + else + for AUTH in $PR_AUTHORS; do + gh api "search/issues?q=repo:joeblack2k/SGM-Helper+is:pr+is:open+author:${AUTH}&per_page=100" \ + --jq '.items[].number' >> /tmp/pr-nums.txt + done + sort -u /tmp/pr-nums.txt -o /tmp/pr-nums.txt + fi + : > /tmp/prs.txt + while read -r num; do + [ -z "$num" ] && continue + META=$(gh api "repos/joeblack2k/SGM-Helper/pulls/$num" \ + --jq '"\(.head.repo.owner.login)|\(.head.ref)|\(.title)"' 2>/dev/null) + OWN="${META%%|*}" + REST="${META#*|}" + REF="${REST%%|*}" + TITLE="${REST#*|}" + if [ "$OWN" != "${{ github.repository_owner }}" ]; then + echo " - skipping PR #$num — head.repo.owner=$OWN (expected ${{ github.repository_owner }})" + continue + fi + echo "$num $REF $TITLE" >> /tmp/prs.txt + done < /tmp/pr-nums.txt + sort -n -o /tmp/prs.txt /tmp/prs.txt + + if [ ! -s /tmp/prs.txt ]; then + echo " No open PRs from ${{ github.repository_owner }} to upstream — intarweb-dev will == main" + else + echo " Open PRs to cherry-pick onto intarweb-dev:" + sed 's/^/ /' /tmp/prs.txt + fi + + - name: 🚨 Warn on intarweb-dev workflow drift (about to be wiped) + # The regen step below does `git checkout -B intarweb-dev main` + # which OVERWRITES intarweb-dev with main's tree, then re-cherry-picks + # the open-PR set. Any workflow edits made directly to intarweb-dev that + # aren't in main AND aren't in an open PR's cherry-pick set get SILENTLY + # WIPED here. + # + # Live burn 2026-06-11 on intarweb/vllm: prior agent committed + # `f1bdf3e33` (TORCH_CUDA_ARCH_LIST trim + 720min timeout) to + # intarweb-dev directly. The next sync regen wiped it without alarm; six + # subsequent build attempts ran with the stale 3-arch list, all + # cancelled at the 360-min wall. Honest-fact #97. + # + # The wipe IS the feature (uniformity contract). The silence is the + # bug. This step makes it visible BEFORE the wipe runs: yellow + # ::warning:: in the Actions UI + diff in the run log. Operator sees + # "I edited intarweb-dev directly and the sync is about to undo it" + # and can re-edit on main (where it survives via the ops-overlay + # save/restore above). + if: steps.cadence.outputs.skip != 'true' + run: | + git fetch origin intarweb-dev 2>/dev/null || { + echo " (no remote intarweb-dev yet — first sync, nothing to wipe)" + exit 0 + } + ANY_DRIFT=0 + for F in .github/workflows/sync-upstream.yml \ + .github/workflows/build-from-source.yml \ + .github/workflows/build-overlay.yml \ + .github/workflows/fork-publish.yml \ + .github/workflows/update-fork-info.yml \ + .github/workflows/fold-on-push.yml \ + FORK_INFO.md; do + # Get both sides; skip cleanly if either lacks the file. + MAIN_BLOB=$(git show "HEAD:$F" 2>/dev/null || echo "") + DEV_BLOB=$(git show "origin/intarweb-dev:$F" 2>/dev/null || echo "") + if [ -z "$MAIN_BLOB" ] && [ -z "$DEV_BLOB" ]; then + continue + fi + if [ "$MAIN_BLOB" = "$DEV_BLOB" ]; then + continue + fi + ANY_DRIFT=1 + echo "::warning::ops-overlay restore will wipe intarweb-dev's $F — differs from main." + echo " If you intended this drift to persist across syncs, commit it to MAIN." + echo " Diff (intarweb-dev → main, head -20):" + diff <(echo "$DEV_BLOB") <(echo "$MAIN_BLOB") 2>/dev/null | head -20 | sed 's/^/ /' + echo "" + done + if [ "$ANY_DRIFT" = "0" ]; then + echo " ✓ no intarweb-dev ops-overlay drift detected" + fi + + - name: 🌿 Regenerate intarweb-dev = main + open-PR cherry-picks + id: regen + if: steps.cadence.outputs.skip != 'true' + # FORK_CARRIED_COMMITS step (below the PR loop) calls `gh api` to + # check carrier-PR / upstream-PR state. GITHUB_TOKEN read scope is + # sufficient — we only read PR state, never write here. + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git checkout -B intarweb-dev main + while read num branch title; do + [ -z "$num" ] && continue + echo "::group::PR #$num — $branch ($title)" + git fetch origin "$branch" || { echo " ✗ failed to fetch origin/$branch"; exit 1; } + # ── Precompiled-wheel guard: SKIP PRs that touch compiled-code files ── + # On precompiled-wheel forks (PRECOMPILED_WHEEL_BASE_URL set) the + # runtime image is an UPSTREAM-prebuilt wheel; csrc/CUDA/kernels + # binaries cannot be relinked at runtime. Carrying a PR that edits + # compiled code would layer Python on a stale .so and silently + # produce wrong binaries — and the post-cherry-pick carry-drift + # guard (below) would then FAIL-CLOSE the whole sync, blocking ALL + # carries (including unrelated Python-only ones). This EARLY skip + # drops only the offending PR via `gh pr files`, BEFORE cherry-pick, + # so the Python carries still land. Logs each skip loudly. + # + # The extension set + path prefixes are kept BYTE-IDENTICAL to the + # carry-drift guard (search ".(cu|cuh|...)") so a PR that passes + # this skip can never trip the guard, and vice-versa. + # + # Belt-and-suspenders: PRECOMPILED_WHEEL_SKIP_PRS (space-separated + # PR numbers) force-skips a PR even if its file query is flaky. + # General rule above is primary; the explicit list is a fallback. + # + # Gated on vars.PRECOMPILED_WHEEL_BASE_URL → byte-identical no-op on + # the 21 source-build forks where that Variable is unset. + if [ -n "${{ vars.PRECOMPILED_WHEEL_BASE_URL }}" ]; then + FORCE_SKIP=" ${{ vars.PRECOMPILED_WHEEL_SKIP_PRS || '' }} " + if [ "$FORCE_SKIP" != " " ] && printf '%s' "$FORCE_SKIP" | grep -qw "$num"; then + echo "::warning::skipping PR #$num — listed in PRECOMPILED_WHEEL_SKIP_PRS (operator force-skip)" + echo "::endgroup::" + continue + fi + PR_FILES=$(gh api --paginate "repos/joeblack2k/SGM-Helper/pulls/$num/files?per_page=100" \ + --jq '.[].filename' 2>/dev/null || true) + COMPILED=$(printf '%s\n' "$PR_FILES" \ + | grep -iE '\.(cu|cuh|cpp|cxx|cc|c|hpp|hxx|hh|h|pyx|pxd|s|S)$|^(csrc|kernels)/' || true) + if [ -n "$COMPILED" ]; then + echo "::warning::skipping PR #$num — touches compiled code, cannot carry into precompiled-wheel fork: $(printf '%s' "$COMPILED" | tr '\n' ' ')" + echo "$COMPILED" | sed 's/^/ /' + echo "::endgroup::" + continue + fi + fi + # Range "intarweb-dev..origin/$branch" = commits on branch not yet on intarweb-dev. + # As we apply more PRs, intarweb-dev grows; later branches only contribute their NEW commits. + # If a PR was merged upstream, its commits are now on main (and thus intarweb-dev), so this is empty. + COMMITS=$(git log --reverse --format=%H "intarweb-dev..origin/$branch") + if [ -z "$COMMITS" ]; then + echo " - no unique commits — already on intarweb-dev (likely merged upstream); skipping" + else + for c in $COMMITS; do + # -X theirs: when a PR's content is a superset of an earlier PR's content + # (common when PRs are stacked), take the cherry-picked version. Honest-fact #54. + # -m 1: tolerate upstream-merge-commits (honest-fact #45). + # --allow-empty --keep-redundant-commits: tolerate the no-op case where the + # commit's changes are already present (e.g. via another PR's superset). + if ! git cherry-pick -X theirs -m 1 --allow-empty --keep-redundant-commits "$c" 2>/dev/null; then + # FAIL-CLOSED (changed 2026-06-10, replacing Heal C silent-drop). + # The previous behavior — `git cherry-pick --abort; break; continue with next PR` + # silently dropped the conflicting PR and kept publishing :latest WITHOUT it. + # Consumers pinning :latest had no signal that PR #N was being silently + # excluded. New behavior: FAIL THE JOB. Subsequent steps (force-push, + # build dispatch) do not run. The previous :latest stays in place, serving. + # Operator notification: ::error:: annotation in the run, the run shows + # red in the actions tab, and portfolio-audit's daily digest catches + # sync_conclusion=="failure" (portfolio-audit.yml line 147 "FAILED_SYNC"). + # Operator must manually rebase the offending PR head branch and force-push + # it before the next sync will succeed. + echo "::error::FOLD FAILED — PR #$num cherry-pick conflict on $c survived -X theirs. intarweb-dev WILL NOT be advanced; :latest remains at last-good. Rebase $branch onto upstream/main + push to recover." + git cherry-pick --abort || true + exit 1 + fi + done + echo " ✓ applied $(echo "$COMMITS" | wc -w) commits from PR #$num" + fi + echo "::endgroup::" + done < /tmp/prs.txt + + # FORK_CARRIED_COMMITS — re-apply long-lived fork-carried patches + # that aren't bound to an OPEN PR's lifecycle. Honest-fact #98. + # + # Why this exists: the open-PR cherry-pick loop above only re-applies + # commits that live on terafin-authored OPEN PRs. If the carrier PR + # closes (unmerged OR superseded), the cherry-pick stops happening + # next sync and the fork SILENTLY REGRESSES to upstream/main's + # behavior. That's catastrophic for patches we depend on at runtime + # (e.g. cumem accounting fix for KV-OOM-free wakes). + # + # Format (per-fork repo Variable FORK_CARRIED_COMMITS, multi-line, + # one record per line, pipe-separated): + # ||| + # + # Fields: + # commit-sha short or long SHA on the fork's working tree + # upstream-pr-to-watch PR # in upstream that, when MERGED, makes + # this patch redundant. Step emits a yellow + # WARNING when merged (operator should drop + # the entry); harmless empty cherry-pick if + # left in (upstream's version is already in + # the hard-reset main, so cherry-pick is no-op). + # carrier-pr optional. PR # under our fork that hosts + # this rebase. If carrier is CLOSED-UNMERGED, + # this step FAILS LOUDLY rather than silently + # dropping the patch — UNLESS disposition is + # skip-on-conflict (see below). + # disposition optional. Default `fail-closed` (current + # behavior — used for OUR PRs that we own and + # are responsible for rebasing). Set to + # `skip-on-conflict` for SPECULATIVE third- + # party carries (PRs from other authors that + # we ride for early bug coverage). Behavior on + # conflict OR carrier-closed-unmerged: + # - fail-closed: emit ::error::, abort sync, + # :latest stays at last-good. Operator + # must intervene. + # - skip-on-conflict: emit ::warning::, drop + # the carry this run, continue regen with + # remaining carries. Wheel-resolver still + # runs. Builds advance with reduced + # coverage. Re-attempts next sync if the + # upstream PR rebases. Honest-fact #100. + # + # When to use skip-on-conflict: a PR from a third-party author you + # don't have rebase-rights on. Silently dropping is the SAME outcome + # as "we never picked it up" — strictly safer than blocking the whole + # pipeline on a carry we don't own. + # + # Examples (intarweb/vllm at 2026-06-12): + # # Our PRs — fail-closed (default, current behavior) + # c9427de62|37111|45208 # cumem OOM fix; we own #45208 + # 88af8e6b0|45097 # /health/decode; we own #45097 + # + # # Third-party speculative — skip-on-conflict + # 54a52c948|45268|45326|skip-on-conflict # waynehacking8's hang fix + # f61febf66|45268|45363|skip-on-conflict # AlejandroParedesLT's drain fix + # + # Comments (lines starting with #) and blank lines are skipped. + if [ -n "${{ vars.FORK_CARRIED_COMMITS }}" ]; then + echo "::group::FORK_CARRIED_COMMITS — re-applying long-lived patches" + echo "${{ vars.FORK_CARRIED_COMMITS }}" > /tmp/fork-carried.txt + while IFS='|' read -r CSHA UPSTREAM_PR CARRIER_PR DISPOSITION; do + CSHA="$(echo "$CSHA" | xargs)" + UPSTREAM_PR="$(echo "$UPSTREAM_PR" | xargs)" + CARRIER_PR="$(echo "$CARRIER_PR" | xargs)" + DISPOSITION="$(echo "${DISPOSITION:-}" | xargs)" + # Skip blanks + comments + [ -z "$CSHA" ] && continue + case "$CSHA" in \#*) continue ;; esac + # Default disposition: fail-closed (preserves prior behavior for + # all existing 3-column entries). Only `skip-on-conflict` enables + # the new degraded-but-non-blocking path. + [ -z "$DISPOSITION" ] && DISPOSITION="fail-closed" + + # Check carrier-PR state (if specified). On closed-unmerged: + # - fail-closed → fail loudly (silent-regression class the + # mechanism exists to prevent for OWNED carries). + # - skip-on-conflict → warn + drop (third-party PRs we don't + # own; same outcome as never picking it up; safer than + # blocking the pipeline). Operator should remove the entry + # when convenient. + # + # On open-and-rebased: AUTO-REFRESH the cherry-pick SHA to the + # carrier PR's CURRENT head. The most common stale-SHA failure + # mode in this whole subsystem is "carrier PR rebased + # upstream-side, pinned SHA in FORK_CARRIED_COMMITS now points + # at a commit that's been GC'd or whose parent moved, cherry- + # pick fails or applies the wrong content" — burned 2026-06-12 + # on intarweb/vllm with PR #45453 (head moved 85129770b → + # 24020531f after a rebase, pinned SHA collided with the open- + # PR auto-cherry-pick loop). Resolving live each sync makes + # this self-healing for owned carriers and surfaces it for + # third-party. fetch-then-resolve so the local repo has the + # commit; fall back to pinned SHA if API query fails (network + # blip or PR temporarily unreachable). + if [ -n "$CARRIER_PR" ]; then + CARRIER_META=$(gh api "repos/joeblack2k/SGM-Helper/pulls/$CARRIER_PR" \ + --jq '"\(.state)|\(.merged)|\(.head.sha)"' 2>/dev/null || echo "unknown||") + C_STATE="${CARRIER_META%%|*}" + C_REST="${CARRIER_META#*|}" + C_MERGED="${C_REST%%|*}" + C_HEAD="${C_REST#*|}" + if [ "$C_STATE" = "closed" ] && [ "$C_MERGED" != "true" ]; then + if [ "$DISPOSITION" = "skip-on-conflict" ]; then + echo "::warning::FORK_CARRIED_COMMITS: speculative carry $CSHA — carrier PR #$CARRIER_PR is CLOSED-UNMERGED. Skipping this run (disposition=skip-on-conflict). Remove the entry from FORK_CARRIED_COMMITS when convenient." + continue + fi + echo "::error::FORK_CARRIED_COMMITS: carrier PR #$CARRIER_PR for commit $CSHA is CLOSED-UNMERGED. Without intervention this patch would silently drop from the next image. ABORTING sync to surface the regression." + echo "::error:: Fix: (a) re-open #$CARRIER_PR and rebase, OR (b) file a new carrier PR and update FORK_CARRIED_COMMITS to point at it, OR (c) if the patch is genuinely no-longer-needed, remove the entire entry from FORK_CARRIED_COMMITS, OR (d) if this is a speculative third-party carry you don't own, change disposition to skip-on-conflict." + exit 1 + fi + # Auto-refresh SHA when carrier is open + rebased. Normalizes + # both pinned + resolved to long-form for prefix-safe compare + # (pinned is often abbreviated, resolved is always 40-char). + if [ "$C_STATE" = "open" ] && [ -n "$C_HEAD" ]; then + CSHA_PREFIX_LEN=${#CSHA} + C_HEAD_PREFIX="${C_HEAD:0:$CSHA_PREFIX_LEN}" + if [ "$C_HEAD_PREFIX" != "$CSHA" ]; then + echo "::notice::FORK_CARRIED_COMMITS: carrier PR #$CARRIER_PR head moved $CSHA → $C_HEAD (auto-refresh). Update FORK_CARRIED_COMMITS at convenience to suppress this notice." + CSHA="$C_HEAD" + # Fetch the new head so cherry-pick can find it (the + # fork's git history may not have it from the earlier + # `git fetch origin --prune`, since carrier branches in + # upstream live on upstream-side). Try upstream remote + # first; if it's an intarweb-side branch, origin already + # has it from the earlier fetch. + git fetch upstream "$C_HEAD" 2>/dev/null || \ + git fetch origin "$C_HEAD" 2>/dev/null || \ + echo "::warning::FORK_CARRIED_COMMITS: failed to fetch resolved head $C_HEAD for carrier PR #$CARRIER_PR — cherry-pick may fail. Retrying with pinned SHA $CSHA as fallback would mask drift; not falling back." + fi + fi + fi + + # Check upstream-PR state (always required) — warn on merged + # (operator should drop the entry; the cherry-pick will be an + # empty no-op below). + UPSTREAM_STATE=$(gh api "repos/joeblack2k/SGM-Helper/pulls/$UPSTREAM_PR" \ + --jq '"\(.state)|\(.merged)"' 2>/dev/null || echo "unknown|") + U_STATE="${UPSTREAM_STATE%%|*}" + U_MERGED="${UPSTREAM_STATE#*|}" + if [ "$U_MERGED" = "true" ]; then + echo "::warning::FORK_CARRIED_COMMITS: upstream PR #$UPSTREAM_PR has MERGED. Commit $CSHA is now redundant — please remove this entry from FORK_CARRIED_COMMITS. Cherry-pick below will no-op." + fi + + # Apply (or empty no-op if already present from main or open-PR loop). + if git cherry-pick -X theirs -m 1 --allow-empty --keep-redundant-commits "$CSHA" 2>/dev/null; then + echo " ✓ applied carried commit $CSHA (upstream-pr=$UPSTREAM_PR carrier=$CARRIER_PR disposition=$DISPOSITION)" + else + git cherry-pick --abort || true + if [ "$DISPOSITION" = "skip-on-conflict" ]; then + # Speculative third-party carry rotted (e.g. another upstream + # PR merged that touches the same files). Drop this run; sync + # continues with remaining carries; wheel-resolver runs as + # normal; :latest can still advance for unrelated reasons. + # Re-attempts next sync (author may rebase upstream-side). + echo "::warning::FORK_CARRIED_COMMITS: speculative carry $CSHA (upstream PR #$UPSTREAM_PR) conflicts with current main this cycle — skipping. Re-attempts on next sync after author rebases." + continue + fi + # default: fail-closed. Same posture as the open-PR loop. + echo "::error::FORK_CARRIED_COMMITS: cherry-pick conflict on $CSHA survived -X theirs. intarweb-dev WILL NOT be advanced; :latest remains at last-good. Operator must rebase the carrier (#$CARRIER_PR) or update FORK_CARRIED_COMMITS to a current SHA. (For speculative third-party carries you don't own, set disposition=skip-on-conflict to degrade gracefully instead.)" + exit 1 + fi + done < /tmp/fork-carried.txt + echo "::endgroup::" + fi + + # Did intarweb-dev's TREE actually change vs the existing remote? + # Compare trees (content), not SHAs (which differ each run because + # cherry-pick rewrites committer timestamps). + if git rev-parse origin/intarweb-dev^{tree} >/dev/null 2>&1; then + OLD_TREE=$(git rev-parse origin/intarweb-dev^{tree}) + NEW_TREE=$(git rev-parse intarweb-dev^{tree}) + if [ "$OLD_TREE" = "$NEW_TREE" ]; then + echo " - intarweb-dev tree unchanged; skipping push + publish trigger" + echo "changed=false" >> "$GITHUB_OUTPUT" + else + echo " - intarweb-dev tree changed; will push + trigger publish" + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + else + echo " - intarweb-dev did not exist on origin; will push + trigger publish" + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + + - name: 🛡️ Carry-drift guard (precompiled-wheel forks only) + # Forks that opt into the upstream-prebuilt-wheel build path (via + # PRECOMPILED_WHEEL_BASE_URL Variable) MUST keep their carry stack + # 100% non-compiled. If a cherry-pick or FORK_CARRIED_COMMIT introduces + # a compiled-code file, the fork's Python carries would be layering + # on stale upstream-compiled-`.so`s and produce wrong binaries silently. + # This guard fails the sync loud at the moment of regression rather + # than at runtime. No-op for forks without the master Variable set. + # + # Detected file extensions: .cu, .cuh, .cpp, .cxx, .cc, .c, .hpp, .hxx, + # .hh, .h, .pyx, .pxd, .s, .S (broadened from the initial .cu/.cpp/.h/.cuh + # after architect-review #1 flagged the gap). --diff-filter=AM excludes + # pure deletions and renames so an upstream `.cu` rename doesn't false-fire. + # + # build-system hint files (setup.py / CMakeLists.txt / pyproject.toml / + # *.cmake) emit a WARNING but don't fail — a Python-only carry that + # touches setup.py to register a `build_ext` hook IS a precompiled-path + # invalidator, but most setup.py edits are harmless. + # + # Escape valve: set CARRY_DRIFT_ALLOW=true on the fork to bypass with a + # loud warning (use for one-shot testing of a compiled-code cherry-pick + # under operator supervision; CLEAR THE VARIABLE after). + # Honest-fact #99. + if: steps.cadence.outputs.skip != 'true' && success() && vars.PRECOMPILED_WHEEL_BASE_URL != '' + run: | + set -euo pipefail + DRIFT=$(git diff --name-only --diff-filter=AM "upstream/main..intarweb-dev" \ + | grep -iE '\.(cu|cuh|cpp|cxx|cc|c|hpp|hxx|hh|h|pyx|pxd|s|S)$' || true) + BUILD_HINTS=$(git diff --name-only --diff-filter=AM "upstream/main..intarweb-dev" \ + | grep -E '(^|/)setup\.py$|(^|/)pyproject\.toml$|CMakeLists\.txt$|\.cmake$' || true) + if [ -n "$BUILD_HINTS" ]; then + echo "::warning::carry stack touches build-system files (setup.py / pyproject.toml / CMake) — review whether they re-enable C++/CUDA compilation:" + echo "$BUILD_HINTS" | sed 's/^/ /' + fi + if [ -n "$DRIFT" ]; then + if [ "${{ vars.CARRY_DRIFT_ALLOW || '' }}" = "true" ]; then + echo "::warning::CARRY-DRIFT detected but CARRY_DRIFT_ALLOW=true is set — proceeding under operator override. CLEAR the variable after this sync:" + echo "$DRIFT" | sed 's/^/ /' + else + # Single retry after 30s — most drift trips are races where a + # carry-bot (vllm-bot et al.) is force-pushing a rebased branch + # mid-sync, so the cherry-picked tree briefly disagrees with the + # upstream HEAD we just fetched. Wait, re-fetch, re-check. + # Honest-fact #112: real-corruption drift persists; race-drift + # resolves on next read. Single retry kills the noise without + # losing the safety net. + echo "::warning::CARRY-DRIFT detected — race-condition possible (carry-bot may be mid-rebase). Retrying in 30s before failing loud:" + echo "$DRIFT" | sed 's/^/ /' + sleep 30 + git fetch upstream "main" --quiet + DRIFT=$(git diff --name-only --diff-filter=AM "upstream/main..intarweb-dev" \ + | grep -iE '\.(cu|cuh|cpp|cxx|cc|c|hpp|hxx|hh|h|pyx|pxd|s|S)$' || true) + if [ -n "$DRIFT" ]; then + echo "::error::CARRY-DRIFT persisted across 30s retry — this is real, not a race. precompiled-wheel forks require 100% non-compiled carries — these compiled-code files appeared in intarweb-dev:" + echo "$DRIFT" | sed 's/^/ /' + echo "::error::Recovery options (pick one):" + echo "::error:: (a) revert the offending commit and rebase the carry" + echo "::error:: (b) clear PRECOMPILED_WHEEL_BASE_URL to fall back to source build (slow but correct)" + echo "::error:: (c) set CARRY_DRIFT_ALLOW=true on this fork to bypass once under operator override (clear after)" + echo "::error:: (d) test on a feature branch first by clearing PRECOMPILED_WHEEL_BASE_URL temporarily on a fork clone" + exit 1 + else + echo " ✓ carry-drift guard cleared on retry — was a transient race" + fi + fi + else + echo " ✓ carry-drift guard clean — no compiled-code carries" + fi + + - name: 🎯 Resolve precompiled wheel SHA + variant (precompiled-wheel forks only) + # Writes PRECOMPILED_WHEEL_COMMIT = current upstream HEAD SHA so the + # build workflow's bargs synth step picks it up. Does NOT pre-verify + # the wheel URL — wheels.vllm.ai (and presumably similar wheel CDNs) + # 403 on directory probes, making sync-time availability checks + # impossible without knowing the exact .whl filename convention. The + # real safety net is the fork-side Dockerfile carry's hard fail-loud + # contract (vllm-bot hardening req #3): when VLLM_USE_PRECOMPILED=1 + # is set but the wheel fetch fails at build time, the build exits + # non-zero immediately rather than silently falling through to a + # multi-hour source compile. Honest-fact #99/#103. + # + # Updates the PRECOMPILED_WHEEL_VARIANT only if the operator's + # PRECOMPILED_WHEEL_VARIANT_CHAIN suggests a different first-choice + # (no — actually we just trust the operator's VARIANT setting). + # Variant fallback would require build-time re-attempt which the + # Dockerfile carry can implement; the resolver writes single VARIANT. + # + # Freeze mode: if PRECOMPILED_WHEEL_COMMIT_FROZEN=true, the resolver + # skips writes entirely (operator pinned for testing or rollback). + # + # gh variable set wrapped survivable: if the SYNC_APP token lacks + # `Variables: Write` permission, the call 403s but the sync continues + # with a warning rather than hard-failing the entire fork's sync. + # Operator grants the app permission, next sync succeeds. + # + # No-op for forks without the master Variable set. Honest-fact #99/#103. + if: steps.cadence.outputs.skip != 'true' && success() && vars.PRECOMPILED_WHEEL_BASE_URL != '' + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + set -uo pipefail # NOT -e — we handle variable-write failures survivably below + SHA=$(git rev-parse upstream/main) + CUR_COMMIT="${{ vars.PRECOMPILED_WHEEL_COMMIT }}" + FROZEN="${{ vars.PRECOMPILED_WHEEL_COMMIT_FROZEN || '' }}" + CUR_VARIANT="${{ vars.PRECOMPILED_WHEEL_VARIANT }}" + + if [ "$FROZEN" = "true" ]; then + echo "::warning::PRECOMPILED_WHEEL_COMMIT_FROZEN=true — skipping wheel-SHA resolution. Pinned to commit=$CUR_COMMIT variant=$CUR_VARIANT." + exit 0 + fi + + # Helper: survivable variable-set. Echoes the run URL into the audit log. + set_var() { + local name="$1" value="$2" + if gh variable set "$name" -b "$value" 2>&1; then + echo " ✓ set $name=$value (audit: $RUN_URL)" + return 0 + else + echo "::warning::failed to write $name (likely SYNC_APP lacks 'Variables: Write' permission — grant via app settings to enable wheel-SHA auto-management). Continuing with prior value." + return 1 + fi + } + + echo " setting PRECOMPILED_WHEEL_COMMIT=$SHA (variant=$CUR_VARIANT — build-time will fail-loud if wheel not actually published per fork-side Dockerfile carry contract)" + if [ "$CUR_COMMIT" != "$SHA" ]; then + set_var PRECOMPILED_WHEEL_COMMIT "$SHA" || true + echo " → PRECOMPILED_WHEEL_COMMIT: $CUR_COMMIT → $SHA" + else + echo " → PRECOMPILED_WHEEL_COMMIT already at $SHA, no change" + fi + + - name: 📤 Force-push intarweb-dev + # Gated on (a) cadence-not-skipped, (b) regen succeeded (implicit via default + # `success()`), (c) tree actually changed. If the cherry-pick loop hit a + # conflict, regen exits 1 → this step does not run → prior intarweb-dev stays + # at last-good → :latest is not republished → fail-closed contract holds. + if: steps.cadence.outputs.skip != 'true' && success() && steps.regen.outputs.changed == 'true' + run: git push --force-with-lease origin intarweb-dev + + - name: 🚀 Trigger publish on intarweb-dev + if: steps.cadence.outputs.skip != 'true' && success() && steps.regen.outputs.changed == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Required because GitHub blocks the normal push→workflow cascade + # when the push is authored by GITHUB_TOKEN (anti-loop safety rule). + # workflow_dispatch IS allowed to fire from GITHUB_TOKEN, so we + # explicitly dispatch the publish workflow here. UNIVERSAL name + # — every Model B fork's build workflow is named exactly this + # via the build-from-source.yml template (honest-fact #52). + gh workflow run "Build from source → GHCR" --repo ${{ github.repository }} --ref intarweb-dev || true + + - name: ✅ Show final state + if: steps.cadence.outputs.skip != 'true' && success() + run: | + echo " main HEAD: $(git log main --oneline -1)" + echo " intarweb-dev HEAD: $(git log intarweb-dev --oneline -1)" + echo " intarweb-dev commits ahead of upstream/main:" + git log --oneline upstream/main..intarweb-dev | sed 's/^/ /' || true diff --git a/.github/workflows/update-fork-info.yml b/.github/workflows/update-fork-info.yml new file mode 100644 index 0000000..810851f --- /dev/null +++ b/.github/workflows/update-fork-info.yml @@ -0,0 +1,150 @@ +name: Update FORK_INFO.md carried-patches table + +# Regenerates the auto-managed "carried patches" section in FORK_INFO.md +# from the current set of open intarweb→upstream PRs. Keeps the doc in +# sync with what :latest actually carries — when a PR merges upstream, +# its row disappears within a cron tick. +# +# Triggers on completion of the sync workflow (so we know intarweb-dev +# was just regenerated) + manual dispatch. Idempotent: only pushes if +# FORK_INFO.md actually changed. +# +# Codified in oss-contributing:ghcr-fork-mirror skill v1.19.0 honest- +# fact #49. + +on: + workflow_run: + workflows: ["Sync from upstream + auto-regen intarweb-dev"] + types: [completed] + workflow_dispatch: + +permissions: + contents: write + pull-requests: read + +env: + UPSTREAM_OWNER: joeblack2k + UPSTREAM_REPO: SGM-Helper + UPSTREAM_BRANCH: main + DEPLOY_BRANCH: intarweb-dev + +jobs: + update-fork-info: + runs-on: ubuntu-latest + steps: + - name: 📥 Checkout default branch + uses: actions/checkout@v4 + with: + token: ${{ secrets.SYNC_WORKFLOW_TOKEN || secrets.GITHUB_TOKEN }} + ref: main + + - name: 🔧 Configure git + run: | + git config user.email "terafin@users.noreply.github.com" + git config user.name "intarweb fork-info bot" + + - name: 📋 Discover open intarweb→upstream PRs + id: prs + env: + GH_TOKEN: ${{ secrets.SYNC_WORKFLOW_TOKEN || secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + gh pr list \ + --repo "${UPSTREAM_OWNER}/${UPSTREAM_REPO}" \ + --state open \ + --limit 100 \ + --json number,title,headRepositoryOwner \ + --jq '[.[] | select(.headRepositoryOwner.login == "${{ github.repository_owner }}")] | sort_by(.number) | .[] | "\(.number)\t\(.title)"' \ + > /tmp/open-prs.tsv + echo "Open intarweb PRs:" + cat /tmp/open-prs.tsv | sed 's/^/ /' || echo " (none)" + + - name: 📝 Regenerate carried-patches table + run: | + set -euo pipefail + + # Build the auto-managed block. Tab-separated input (num\ttitle). + # Title is wrapped in `_…_` if list is empty so the markdown + # renders gracefully. + { + echo "" + echo "" + if [ -s /tmp/open-prs.tsv ]; then + echo "| PR | Title |" + echo "|---|---|" + while IFS=$'\t' read -r num title; do + [ -z "$num" ] && continue + # Escape any pipe in the title to keep table cells valid + safe_title="${title//|/\\|}" + echo "| [#${num}](https://github.com/${UPSTREAM_OWNER}/${UPSTREAM_REPO}/pull/${num}) | ${safe_title} |" + done < /tmp/open-prs.tsv + else + echo "_No open intarweb→upstream PRs — \`${DEPLOY_BRANCH}\` matches \`upstream/${UPSTREAM_BRANCH}\` exactly._" + fi + echo "" + echo "" + } > /tmp/patches-block.md + + # Replace markers in-place if present, else append a new section + if [ ! -f FORK_INFO.md ]; then + # Seed a minimal skeleton via printf — heredoc-with-column-0 + # content inside a `run: |` YAML block-scalar terminates the + # block (honest-fact #68). printf '%s\n' keeps every line + # inside the run-block's indentation. Single-quoted heredoc + # also failed to expand ${UPSTREAM_REPO}/${DEPLOY_BRANCH}/ + # ${UPSTREAM_BRANCH}, writing them as literal text — fixed + # here as a side-effect of the printf migration. (Burned + # 2026-06-11 → 2026-06-13: the auto-heal push cycle was + # firing this seed path on 20 marker-less forks and failing + # with "workflow file issue", silently leaving FORK_INFO.md + # missing for 2 days.) + printf '%s\n' \ + "# Fork tracking — intarweb/${UPSTREAM_REPO}" \ + "" \ + "(Auto-generated section below tracks what \`${DEPLOY_BRANCH}\` currently" \ + "carries vs \`upstream/${UPSTREAM_BRANCH}\`. Everything outside the" \ + "markers is hand-maintained.)" \ + "" \ + "## Local patches we carry on \`${DEPLOY_BRANCH}\` (vs \`upstream/${UPSTREAM_BRANCH}\`)" \ + "" \ + > FORK_INFO.md + cat /tmp/patches-block.md >> FORK_INFO.md + elif grep -q 'intarweb:patches-start' FORK_INFO.md; then + awk -v block="/tmp/patches-block.md" ' + // { + in_block = 0 + next + } + !in_block { print } + ' FORK_INFO.md > /tmp/fork-info-new.md + mv /tmp/fork-info-new.md FORK_INFO.md + else + # No markers yet — append a new auto-managed section at the end + { + echo "" + echo "## Carried patches (auto-managed)" + echo "" + cat /tmp/patches-block.md + } >> FORK_INFO.md + fi + + - name: 💾 Commit + push if changed + run: | + set -euo pipefail + git add FORK_INFO.md + # Stage first — git diff --quiet on an untracked file returns 0 ("no + # tracked changes") and would silently skip a brand-new FORK_INFO.md. + # --cached catches both cases. + if git diff --cached --quiet -- FORK_INFO.md; then + echo "✓ FORK_INFO.md unchanged — skipping push" + exit 0 + fi + git commit -m "docs(fork-info): auto-update carried-patches table" + git push origin HEAD:refs/heads/main + echo "✓ pushed updated FORK_INFO.md" diff --git a/FORK_INFO.md b/FORK_INFO.md new file mode 100644 index 0000000..9bc5288 --- /dev/null +++ b/FORK_INFO.md @@ -0,0 +1,61 @@ +# Fork tracking — intarweb/SGM-Helper + +This is a soft fork of [`joeblack2k/SGM-Helper`](https://github.com/joeblack2k/SGM-Helper) — the Rust workspace that builds `sgm-mister-helper`, `sgm-steamdeck-helper`, `sgm-windows-helper`, and the Anbernic/GameCube variants. **Source-only fork** (no GHCR pipeline — the helpers are device binaries, not container images). + +> All forks we manage with this pattern live under the [`intarweb`](https://github.com/intarweb) GitHub org. See the `ghcr-fork-mirror` skill (the patch-carrying mechanics still apply even though there's no Docker image). + +## Why this fork exists + +We carry a single patch while it's in review upstream: + +| PR | Description | Status | +|---|---|---| +| **[joeblack2k/SGM-Helper#1](https://github.com/joeblack2k/SGM-Helper/pull/1)** — `feat/recover-stale-sync-lock` | `_reconnect_loop` would die after a reboot mid-sync because the helper bailed on any pre-existing `/sync.lock` regardless of whether the recorded PID was still alive. Adds liveness-check + 6h-age fallback + single recovery retry. | Open | + +Live repro on a MiSTer 2026-06-06: `pid=19199` started_at 13:29:25Z, reboot 13:33Z, watcher restarted, ran 3 cycles fine, died on cycle 209 with the stale-lock bail; PID 19199 no longer existed. + +## Branch model + +| Branch | Purpose | +|---|---| +| `main` | Tracks upstream `main` | +| `feat/recover-stale-sync-lock` | The PR branch — applies the fix to all three helpers (mister, steamdeck, windows) | + +## Files we patch + +- `helpers/mister/Cargo.toml` — add `libc` (cfg(unix)) +- `helpers/mister/src/syncer.rs` — new helpers + `SyncLock::acquire` rewrite + 7 unit tests +- `helpers/steamdeck/Cargo.toml` — same dep add +- `helpers/steamdeck/src/syncer.rs` — same fix + tests +- `helpers/windows/Cargo.toml` — same dep add +- `helpers/windows/src/syncer.rs` — same fix + tests (Unix-specific tests cfg-gated) + +## How to consume locally + +The intarweb fork doesn't publish prebuilt binaries — clone, build, install: + +```bash +git clone https://github.com/intarweb/SGM-Helper -b feat/recover-stale-sync-lock /tmp/sgm +cd /tmp/sgm + +# MiSTer (armv7) — cross-compile from the build host +cargo build -p sgm-mister-helper --release --target armv7-unknown-linux-gnueabihf +scp target/armv7-unknown-linux-gnueabihf/release/sgm-mister-helper root@mister:/media/fat/ + +# Steam Deck (x86_64-linux) +cargo build -p sgm-steamdeck-helper --release +``` + +When PR #1 lands upstream, this fork dissolves and devices return to upstream's published binaries. + +## Carried patches (auto-managed) + + + +| PR | Title | +|---|---| +| [#11](https://github.com/joeblack2k/SGM-Helper/pull/11) | fix(scanner): libretro/RetroDECK compatibility batch (PS1 + Sega + RTC/MPK/CPK + TG-16) | +| [#12](https://github.com/joeblack2k/SGM-Helper/pull/12) | helper: syncer reliability — stale-lock recovery + watch-loop survival | +| [#13](https://github.com/joeblack2k/SGM-Helper/pull/13) | feat(config): add `secure` flag to opt into HTTPS in base_url | + + diff --git a/helpers/mister/Cargo.toml b/helpers/mister/Cargo.toml index fd18c67..08f621d 100644 --- a/helpers/mister/Cargo.toml +++ b/helpers/mister/Cargo.toml @@ -21,6 +21,13 @@ sha2 = "0.10" time = { version = "0.3", features = ["formatting", "macros"] } walkdir = "2.5" +# Unix-only: PID liveness check (kill(pid, 0)) for stale-lockfile recovery +# in syncer::pid_alive. Windows builds get a conservative "treat as alive" +# default and rely on the cross-platform 6h-age fallback to recover stuck +# locks; no Windows-specific dep is needed. +[target.'cfg(unix)'.dependencies] +libc = "0.2" + [dev-dependencies] assert_cmd = "2.0" httpmock = "0.8" diff --git a/helpers/mister/src/config.rs b/helpers/mister/src/config.rs index 36573ab..fd10910 100644 --- a/helpers/mister/src/config.rs +++ b/helpers/mister/src/config.rs @@ -19,6 +19,7 @@ pub struct ConfigOverrides { pub force_upload: Option, pub dry_run: Option, pub route_prefix: Option, + pub secure: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -34,6 +35,7 @@ pub struct AppConfig { pub force_upload: bool, pub dry_run: bool, pub route_prefix: String, + pub secure: bool, pub binary_dir: PathBuf, pub config_path: PathBuf, } @@ -70,7 +72,8 @@ impl LoadedConfig { impl AppConfig { pub fn base_url(&self) -> String { - format!("http://{}:{}", self.url, self.port) + let scheme = if self.secure { "https" } else { "http" }; + format!("{}://{}:{}", scheme, self.url, self.port) } pub fn resolved_root(&self) -> Result { @@ -164,6 +167,7 @@ impl AppConfig { "ROUTE_PREFIX", "", )?); + let secure = choose_bool(overrides.secure, env_values, ini_values, "SECURE", false)?; if watch_interval == 0 { bail!("WATCH_INTERVAL moet >= 1 zijn"); @@ -181,6 +185,7 @@ impl AppConfig { force_upload, dry_run, route_prefix, + secure, binary_dir, config_path: config_path.to_path_buf(), }) @@ -498,6 +503,34 @@ SAVE_PATH="/home/snes9x/save" assert_eq!(cfg.base_url(), "http://192.168.1.9:9096"); } + #[test] + fn base_url_uses_https_when_secure_true() { + let cfg = test_sources( + ConfigOverrides::default(), + &[ + ("URL", "saves.example.com"), + ("PORT", "443"), + ("SECURE", "true"), + ], + &[], + ) + .unwrap(); + assert_eq!(cfg.base_url(), "https://saves.example.com:443"); + assert!(cfg.secure); + } + + #[test] + fn base_url_remains_http_when_secure_unset_for_backwards_compat() { + let cfg = test_sources( + ConfigOverrides::default(), + &[("URL", "192.168.1.9"), ("PORT", "9096")], + &[], + ) + .unwrap(); + assert_eq!(cfg.base_url(), "http://192.168.1.9:9096"); + assert!(!cfg.secure); + } + #[test] fn precedence_cli_over_env_over_ini_over_default() { let cfg = test_sources( diff --git a/helpers/mister/src/lib.rs b/helpers/mister/src/lib.rs index 6aba864..025cb1a 100644 --- a/helpers/mister/src/lib.rs +++ b/helpers/mister/src/lib.rs @@ -63,6 +63,7 @@ pub fn run() -> Result<()> { force_upload: None, dry_run: None, route_prefix: cli.route_prefix.clone(), + secure: None, }; let loaded = LoadedConfig::load(cli.config.clone(), &global_overrides)?; diff --git a/helpers/mister/src/scanner.rs b/helpers/mister/src/scanner.rs index 562059a..da41a06 100644 --- a/helpers/mister/src/scanner.rs +++ b/helpers/mister/src/scanner.rs @@ -657,19 +657,24 @@ pub fn classify_supported_save( &save_lower, &[ "master system", + "mastersystem", "/sms/", "\\sms\\", "game gear", + "gamegear", "/gg/", "\\gg\\", "sega 32x", "sega-32x", "sega32x", + "sega32xjp", + "sega32xna", "/32x/", "\\32x\\", "mega cd", "mega-cd", "megacd", + "megacdjp", "sega cd", "sega-cd", "segacd", @@ -684,11 +689,13 @@ pub fn classify_supported_save( "genesis", "mega drive", "megadrive", + "megadrivejp", "/md/", "\\md\\", "/gen/", "\\gen\\", "saturn", + "saturnjp", "dreamcast", "sega", ], @@ -724,6 +731,54 @@ pub fn classify_supported_save( return None; } + if contains_any( + &save_lower, + &[ + "pc engine", + "pcengine", + "pc-engine", + "pcenginecd", + "pc-engine-cd", + "turbografx", + "turbo grafx", + "turbo-grafx", + "tgfx16", + "/tg16/", + "\\tg16\\", + "/tg-cd/", + "\\tg-cd\\", + "/tg_cd/", + "\\tg_cd\\", + "supergrafx", + "super grafx", + "/sgx/", + "\\sgx\\", + "/pce/", + "\\pce\\", + "/pce-cd/", + "\\pce-cd\\", + "/pcecd/", + "\\pcecd\\", + "nec - pc engine", + "nec - turbografx", + "mednafen-pce", + "beetle pce", + "beetle-pce", + ], + ) { + let slug = infer_nec_slug(&save_lower); + if is_plausible_save_for_system(&save_ext, save_size, slug) { + return classify_if_valid( + save_path, + &save_ext, + save_size, + slug, + format!("path hint nec + .{} ({} bytes)", save_ext, save_size), + ); + } + return None; + } + if contains_any( &save_lower, &[ @@ -918,10 +973,13 @@ fn infer_sega_slug(haystack: &str) -> &'static str { ) { return "dreamcast"; } - if contains_any(haystack, &["master system", "/sms/", "\\sms\\"]) { + if contains_any( + haystack, + &["master system", "mastersystem", "/sms/", "\\sms\\"], + ) { return "master-system"; } - if contains_any(haystack, &["game gear", "/gg/", "\\gg\\"]) { + if contains_any(haystack, &["game gear", "gamegear", "/gg/", "\\gg\\"]) { return "game-gear"; } if contains_any( @@ -970,6 +1028,14 @@ fn infer_sega_slug(haystack: &str) -> &'static str { "genesis" } +fn infer_nec_slug(_haystack: &str) -> &'static str { + // MiSTer's TurboGrafx-16 core handles both HuCard ROMs and CD-ROM games + // under one core (TGFX16). Saves land in /media/fat/saves/TGFX16/ for + // both. Future work could split tgfx16-cd if MiSTer's core ever + // splits, or if a downstream consumer wants the distinction. + "tgfx16" +} + fn infer_sony_slug(haystack: &str) -> &'static str { if contains_any( haystack, @@ -1074,6 +1140,7 @@ fn is_plausible_save_for_system(ext: &str, size: u64, slug: &str) -> bool { "saturn" => matches!(ext, "sav" | "srm" | "ram" | "bkr"), "dreamcast" => matches!(ext, "bin" | "vms" | "dci"), "neogeo" => matches!(ext, "sav" | "srm" | "ram"), + "tgfx16" => matches!(ext, "sav" | "srm" | "ram" | "bkr" | "brm"), "wii" => ext == "bin", "psx" => matches!( ext, @@ -1096,10 +1163,19 @@ fn is_plausible_save_for_system(ext: &str, size: u64, slug: &str) -> bool { size, 512 | 1024 | 2048 | 4096 | 8192 | 16384 | 32768 | 65536 | 131072 ), - "gameboy" => matches!( - size, - 512 | 1024 | 2048 | 4096 | 8192 | 16384 | 32768 | 65536 - ), + "gameboy" => match ext { + // .rtc carries the GBC real-time clock state (Pokemon + // Crystal/Gold/Silver, Harvest Moon GBC). It's structurally + // different from SRAM — typically 8 bytes (MiSTer Gameboy_MiSTer + // canonical layout) up to ~64 bytes for variants with extra + // metadata. Don't apply the SRAM size profile to it. + "rtc" => (1..=64).contains(&size), + // Authoritative GB/GBC SRAM sizes (save-format spec): 512B, 2K, 8K, + // 32K, 64K, 128K (MBC5 max). Drops non-real power-of-two values and + // ADDS 131072 — the old ceiling silently rejected legit 128K MBC5 + // saves. + _ => matches!(size, 512 | 2048 | 8192 | 32768 | 65536 | 131072), + }, "gba" => matches!(size, 512 | 8192 | 32768 | 65536 | 131072), "n64" => match ext { "eep" => size == 512 || size == 2048, @@ -1144,6 +1220,13 @@ fn is_plausible_save_for_system(ext: &str, size: u64, slug: &str) -> bool { // a power-of-two size. size == 0x12000 || (size.is_power_of_two() && (512..=2_097_152).contains(&size)) } + "tgfx16" => { + // TurboGrafx-16 / PC Engine HuCard battery saves are typically 2KB SRAM. + // PCE CD / TGFX-CD adds backup RAM (BRAM) — 2KB stock, 8KB extended, + // or up to 32KB on some accessories. Accept the common power-of-two + // sizes in that range. + matches!(size, 2048 | 4096 | 8192 | 16384 | 32768) + } "wii" => (WII_DATA_BIN_FILE_HEADER_OFFSET + 0x80..=MAX_SAVE_BYTES as usize) .contains(&(size as usize)), "psx" => { @@ -1991,10 +2074,14 @@ fn validate_ps1_raw_memcard(bytes: &[u8]) -> bool { } } - let trailing_start = 63 * PS1_FRAME_SIZE; - let trailing_end = trailing_start + PS1_FRAME_SIZE; - let trailing = &header[trailing_start..trailing_end]; - trailing.starts_with(b"MC") && frame_checksum_ok(trailing) + // Frame 63 ("write test sector") historically duplicates frame 0's MC magic + // on real-hardware-formatted cards. Many emulators (SwanStation / Beetle + // PSX / Duckstation libretro) do NOT populate it that way — they leave it + // zero-filled or write game data through it. Requiring "MC" here was + // rejecting every RetroArch/SwanStation memcard as if it were noise. + // Frame 0 magic + frames 1-15 checksums are sufficient evidence this is + // a real PS1 memcard. + true } fn frame_checksum_ok(frame: &[u8]) -> bool { @@ -2766,4 +2853,190 @@ mod tests { fn write_be_u32(payload: &mut [u8], offset: usize, value: u32) { payload[offset..offset + 4].copy_from_slice(&value.to_be_bytes()); } + #[test] + fn ps1_memcard_accepted_when_trailing_frame_lacks_mc_magic() { + // Build a valid memcard, then overwrite frame 63 with the "looks like + // game data continuation" bytes seen in real SwanStation/RetroArch + // memcards (e.g. Tom Clancy's Rainbow Six - Lone Wolf saved on Steam + // Deck). Frame 63 historically held the "write test sector" with MC + // magic; emulators don't always honor that — they leave it zeroed + // OR write actual game state through it. The validator must accept + // these. Issue #N (filed alongside this commit). + let mut bytes = build_valid_ps1_memcard(); + let trailing_start = 63 * PS1_FRAME_SIZE; + // Wipe to zero, then write the observed bytes from the real-world + // SwanStation Rainbow Six memcard at this offset. + for b in bytes[trailing_start..trailing_start + PS1_FRAME_SIZE].iter_mut() { + *b = 0; + } + bytes[trailing_start..trailing_start + 8] + .copy_from_slice(&[0x03, 0x00, 0x00, 0x00, 0x80, 0x0C, 0x5A, 0x27]); + assert!( + validate_ps1_raw_memcard(&bytes), + "memcard with non-MC trailing frame should be accepted" + ); + } + + #[test] + fn ps1_memcard_accepted_when_trailing_frame_is_zero_filled() { + // Another common emulator pattern: leave frame 63 entirely zeroed. + let mut bytes = build_valid_ps1_memcard(); + let trailing_start = 63 * PS1_FRAME_SIZE; + for b in bytes[trailing_start..trailing_start + PS1_FRAME_SIZE].iter_mut() { + *b = 0; + } + assert!( + validate_ps1_raw_memcard(&bytes), + "memcard with zero-filled trailing frame should be accepted" + ); + } + + #[test] + fn ps1_memcard_still_rejected_when_size_wrong() { + let bytes = vec![0u8; PS1_MEMCARD_SIZE - 1]; + assert!( + !validate_ps1_raw_memcard(&bytes), + "memcard with wrong size must still be rejected" + ); + } + + #[test] + fn ps1_memcard_still_rejected_when_frame0_magic_absent() { + let mut bytes = build_valid_ps1_memcard(); + bytes[0] = b'X'; + bytes[1] = b'Y'; + assert!( + !validate_ps1_raw_memcard(&bytes), + "memcard without MC magic at frame 0 must still be rejected" + ); + } + + #[test] + fn ps1_memcard_still_rejected_when_directory_frame_checksum_broken() { + let mut bytes = build_valid_ps1_memcard(); + // Corrupt directory frame 3 (intentionally NOT touching frame 0 or + // the trailing frame — proves we still require directory frames to + // be well-formed even though we relaxed the trailing check). + let frame3_start = 3 * PS1_FRAME_SIZE; + let checksum_byte_index = frame3_start + PS1_FRAME_SIZE - 1; + bytes[checksum_byte_index] = bytes[checksum_byte_index].wrapping_add(1); + assert!( + !validate_ps1_raw_memcard(&bytes), + "memcard with broken directory frame checksum must still be rejected" + ); + } + + #[test] + fn ps1_memcard_regression_strict_format_still_accepted() { + // Regression guard: the original strict-format memcard (with MC at + // frame 63) must continue to pass — we only RELAXED the trailing + // check, not removed support for it. + let bytes = build_valid_ps1_memcard(); + assert!( + validate_ps1_raw_memcard(&bytes), + "strict-format memcard with MC at trailing frame should still be accepted" + ); + } + + #[test] + fn classify_recognizes_retrodeck_sega_single_word_dirs() { + // RetroDECK uses single-word lowercase directory names (gamegear, + // mastersystem, megadrive). The Sega classifier historically had + // some of these (megadrive, megacd, sega32x) but was missing + // gamegear and mastersystem. Issue #5 (filed alongside this commit). + // Tests cover both the fix and the previously-broken paths. + let tmp = tempfile::tempdir().unwrap(); + + // Build a 64K non-printable SRAM payload (battery save shape). 0xA5 + // is binary-looking — using a printable byte like 0x42 ('B') would + // trip looks_plain_text() and bail at the early-return guard in + // classify_supported_save before any path-hint dispatch runs. + let payload = vec![0xA5u8; 65536]; + + for (subdir, expected_slug) in &[ + ("gamegear", "game-gear"), // FIX — was broken + ("mastersystem", "master-system"), // FIX — was broken + ("megadrive", "genesis"), // regression guard + ("megacd", "sega-cd"), // regression guard + ("sega32x", "sega-32x"), // regression guard + ("genesis", "genesis"), // regression guard (English name) + ("megacdjp", "sega-cd"), // FIX — JP variant + // saturnjp omitted: Saturn classifier requires a valid backup-RAM + // header signature in passes_binary_validation, so a flat 0xA5 + // payload is rejected before slug assignment. Worth its own test + // with a synthesized Saturn Bk-header fixture. + ("sega32xjp", "sega-32x"), // FIX — JP variant + ("sega32xna", "sega-32x"), // FIX — NA variant + ("megadrivejp", "genesis"), // FIX — JP variant + ] { + let dir = tmp.path().join("retrodeck/saves").join(subdir); + fs::create_dir_all(&dir).unwrap(); + let save = dir.join("Test Game.srm"); + fs::write(&save, &payload).unwrap(); + + let classification = classify_supported_save(&save, None); + assert!( + classification.is_some(), + "expected /{subdir}/ to classify (would skip in production with 'outside allowed console families' if None)", + ); + let got = classification.unwrap().system_slug; + assert_eq!( + got, *expected_slug, + "/{subdir}/ classified as {got}, expected {expected_slug}", + ); + } + } + + #[test] + fn classify_recognizes_pcengine_tgfx16_paths() { + // The TurboGrafx-16 / PC Engine block was missing entirely from + // classify_supported_save before this PR. Saves under any of the + // common dir conventions (RetroDECK / standalone / libretro) were + // skipped at "outside allowed console families." Issue (filed + // alongside this commit). + let tmp = tempfile::tempdir().unwrap(); + let payload = vec![0xA5u8; 2048]; // 2KB — canonical HuCard SRAM size; 0xA5 avoids looks_plain_text + + for subdir in &[ + "pcengine", // RetroDECK convention + "pcenginecd", // RetroDECK CD variant + "pc-engine", // hyphen variant + "pc-engine-cd", // hyphen CD variant + "tg16", // older convention + "tg-cd", // older CD convention + "tg_cd", // underscore CD variant + "supergrafx", // SuperGrafx + "super grafx", // SuperGrafx with space + "sgx", // SuperGrafx slug + "pce", // short PC Engine slug + "pce-cd", // short PCE CD + "pcecd", // short PCE CD packed + "PC Engine", // standalone with space + "TurboGrafx", // standalone English name + "turbo grafx", // English name with space + "turbo-grafx", // English name with hyphen + "tgfx16", // MiSTer slug + "nec - pc engine", // NoIntro DAT naming + "nec - turbografx", // NoIntro DAT naming variant + "mednafen-pce", // libretro core name + "beetle pce", // alternate libretro name + "beetle-pce", // hyphen variant + ] { + let dir = tmp.path().join("retrodeck/saves").join(subdir); + fs::create_dir_all(&dir).unwrap(); + let save = dir.join("Test Game.srm"); + fs::write(&save, &payload).unwrap(); + + let classification = classify_supported_save(&save, None); + assert!( + classification.is_some(), + "expected /{subdir}/ to classify (would skip in production with 'outside allowed console families' if None)", + ); + let got = classification.unwrap().system_slug; + assert_eq!( + got, "tgfx16", + "/{subdir}/ classified as {got}, expected tgfx16", + ); + } + } } diff --git a/helpers/mister/src/syncer.rs b/helpers/mister/src/syncer.rs index bc4d078..c81ae6b 100644 --- a/helpers/mister/src/syncer.rs +++ b/helpers/mister/src/syncer.rs @@ -47,35 +47,174 @@ struct ProcessedEntry { entry: SyncedEntry, } +#[derive(Debug)] struct SyncLock { path: PathBuf, } +/// Maximum lockfile age before we treat it as stale even if we can't verify +/// PID liveness (Windows fallback, or any platform where the recorded PID +/// can't be parsed). A normal sync finishes in seconds; six hours is a wide +/// safety margin so we don't yank a long-running sync, while still recovering +/// from a stuck lockfile within the same day. +const STALE_LOCK_MAX_AGE_SECS: u64 = 6 * 60 * 60; + +/// Verdict on whether an existing lockfile is "stale" (safe to take over) or +/// "active" (must bail). Carries a human-readable reason for the warning log. +#[derive(Debug, PartialEq, Eq)] +enum StaleVerdict { + /// Recorded PID is no longer alive on this system. + DeadPid(u32), + /// Lockfile content couldn't be parsed as our format. + Malformed, + /// Lockfile is older than `STALE_LOCK_MAX_AGE_SECS` (cross-platform fallback). + TooOld { age_secs: u64 }, + /// Lockfile looks like an active sync — caller must bail. + Active, +} + +/// Parse the `pid=` line out of a lockfile body. +fn parse_lock_pid(contents: &str) -> Option { + for line in contents.lines() { + if let Some(rest) = line.strip_prefix("pid=") + && let Ok(pid) = rest.trim().parse::() + { + return Some(pid); + } + } + None +} + +/// Best-effort check of whether `pid` corresponds to a running process on +/// this host. On Unix this uses `kill(pid, 0)` which doesn't deliver a +/// signal — it only validates that the kernel has a process with that pid +/// reachable to us. On Windows we return `true` ("conservatively alive") +/// and rely on the age-based fallback to recover from stuck locks. +fn pid_alive(pid: u32) -> bool { + #[cfg(unix)] + { + // SAFETY: `libc::kill` with signum 0 performs only existence-check; + // it has no side effects on the target process. + let r = unsafe { libc::kill(pid as libc::pid_t, 0) }; + if r == 0 { + // Process exists and we have permission to signal it. + return true; + } + // Distinguish "no such process" (ESRCH) from "permission denied" (EPERM). + // EPERM means the pid IS alive but is owned by a different uid — treat + // as alive so we don't take over another user's lock. + let errno = std::io::Error::last_os_error().raw_os_error(); + errno != Some(libc::ESRCH) + } + #[cfg(not(unix))] + { + let _ = pid; + true + } +} + +/// Decide whether an existing `sync.lock` at `path` (with the given on-disk +/// contents) should be treated as stale. See `StaleVerdict` for the cases. +fn classify_existing_lock(path: &Path, contents: &str) -> StaleVerdict { + // First, try the PID-based check: parse the pid line and ask the OS + // whether it's still alive. If parse fails the file is malformed — most + // likely an interrupted write — also stale. + match parse_lock_pid(contents) { + Some(pid) => { + if !pid_alive(pid) { + return StaleVerdict::DeadPid(pid); + } + } + None => return StaleVerdict::Malformed, + } + + // PID-alive path: as a cross-platform fallback (and a backstop for + // exotic cases like PID reuse on long-running hosts), also treat any + // lockfile older than the configured threshold as stale. + if let Ok(meta) = fs::metadata(path) + && let Ok(modified) = meta.modified() + && let Ok(age) = modified.elapsed() + { + let age_secs = age.as_secs(); + if age_secs > STALE_LOCK_MAX_AGE_SECS { + return StaleVerdict::TooOld { age_secs }; + } + } + + StaleVerdict::Active +} + impl SyncLock { fn acquire(state_dir: &Path) -> Result { let path = state_dir.join("sync.lock"); - let lock_content = format!( - "pid={}\\nstarted_at={}\\n", - std::process::id(), - now_rfc3339() - ); - match fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&path) - { - Ok(mut file) => { - use std::io::Write; - file.write_all(lock_content.as_bytes()) - .with_context(|| format!("kan lockfile niet schrijven: {}", path.display()))?; - Ok(Self { path }) - } - Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => { - anyhow::bail!("sync is al actief (lockfile bestaat): {}", path.display()); + let lock_content = format!("pid={}\nstarted_at={}\n", std::process::id(), now_rfc3339()); + // Single retry: if the first create_new fails because the lockfile + // already exists, inspect it. If we can prove it's stale (dead PID, + // malformed, or too old), remove it and try once more. We never loop + // — exactly one recovery attempt per acquire call, so a genuinely + // contended sync still bails after a deterministic single retry. + for attempt in 0..2 { + match fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&path) + { + Ok(mut file) => { + use std::io::Write; + file.write_all(lock_content.as_bytes()).with_context(|| { + format!("kan lockfile niet schrijven: {}", path.display()) + })?; + return Ok(Self { path }); + } + Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => { + if attempt == 1 { + // We already tried to recover once on this call — bail. + anyhow::bail!("sync is al actief (lockfile bestaat): {}", path.display()); + } + let existing = fs::read_to_string(&path).unwrap_or_default(); + match classify_existing_lock(&path, &existing) { + StaleVerdict::DeadPid(pid) => { + eprintln!( + "waarschuwing: oude sync.lock van pid={} verwijderd (proces draait niet meer): {}", + pid, + path.display() + ); + let _ = fs::remove_file(&path); + // loop continues to retry create_new + } + StaleVerdict::Malformed => { + eprintln!( + "waarschuwing: oude sync.lock met onleesbare inhoud verwijderd: {}", + path.display() + ); + let _ = fs::remove_file(&path); + } + StaleVerdict::TooOld { age_secs } => { + eprintln!( + "waarschuwing: oude sync.lock ({}s oud, drempel {}s) verwijderd: {}", + age_secs, + STALE_LOCK_MAX_AGE_SECS, + path.display() + ); + let _ = fs::remove_file(&path); + } + StaleVerdict::Active => { + anyhow::bail!( + "sync is al actief (lockfile bestaat): {}", + path.display() + ); + } + } + } + Err(err) => { + return Err(err).with_context(|| { + format!("kan sync lockfile niet maken: {}", path.display()) + }); + } } - Err(err) => Err(err) - .with_context(|| format!("kan sync lockfile niet maken: {}", path.display())), } + // Unreachable: the loop either returns Ok, bails, or returns Err. + unreachable!("SyncLock::acquire loop should always exit via return or bail"); } } @@ -502,6 +641,14 @@ fn restore_single_cloud_save( ); if existing_local_save_is_valid(&provisional_path, &system_slug) { + if verbose { + eprintln!( + "Skipping cloud {} save '{}': a valid local save is already present at {}", + system_slug, + cloud_save.display_name(), + provisional_path.display() + ); + } return Ok(()); } @@ -515,6 +662,14 @@ fn restore_single_cloud_save( if native_target_path != provisional_path && existing_local_save_is_valid(&native_target_path, &system_slug) { + if verbose { + eprintln!( + "Skipping cloud {} save '{}': a valid local save is already present at the native target {}", + system_slug, + cloud_save.display_name(), + native_target_path.display() + ); + } return Ok(()); } if !target_parent_allowed(&provisional_path, save_roots, create_missing_system_dirs) { @@ -532,10 +687,26 @@ fn restore_single_cloud_save( let provisional_key = provisional_path.to_string_lossy().to_string(); if restored_targets.contains(&provisional_key) { + if verbose { + eprintln!( + "Skipping cloud {} save '{}': target {} was already restored earlier in this scan", + system_slug, + cloud_save.display_name(), + provisional_key + ); + } return Ok(()); } let native_target_key = native_target_path.to_string_lossy().to_string(); if native_target_key != provisional_key && restored_targets.contains(&native_target_key) { + if verbose { + eprintln!( + "Skipping cloud {} save '{}': native target {} was already restored earlier in this scan", + system_slug, + cloud_save.display_name(), + native_target_key + ); + } return Ok(()); } @@ -579,6 +750,14 @@ fn restore_single_cloud_save( ); if existing_local_save_is_valid(&target_path, &system_slug) { + if verbose { + eprintln!( + "Skipping cloud {} save '{}': a valid local save is already present at the final target {}", + system_slug, + cloud_save.display_name(), + target_path.display() + ); + } return Ok(()); } if !target_parent_allowed(&target_path, save_roots, create_missing_system_dirs) { @@ -596,6 +775,14 @@ fn restore_single_cloud_save( let target_key = target_path.to_string_lossy().to_string(); if restored_targets.contains(&target_key) { + if verbose { + eprintln!( + "Skipping cloud {} save '{}': final target {} was already restored earlier in this scan", + system_slug, + cloud_save.display_name(), + target_key + ); + } return Ok(()); } @@ -2412,7 +2599,27 @@ fn save_selection_key(save_path: &Path) -> String { { return format!("wii:{}", title_code.to_ascii_lowercase()); } - filename_stem(save_path).to_ascii_lowercase() + let stem = filename_stem(save_path).to_ascii_lowercase(); + // Additive save extensions are NOT alternative formats of the main save — + // they carry independent state that must NOT be deduplicated against the + // primary battery save. Examples: + // .rtc — GBC real-time clock state (Pokemon Crystal/Gold/Silver, + // Harvest Moon GBC, etc). The .srm is the SRAM; the .rtc is + // the day/night clock. Dropping .rtc loses time-based events + // like Pokemon Crystal evolutions and the in-game day cycle. + // .mpk / .cpk — N64 controller pak / memory pak. Independent of the + // main .eep / .fla / .sra battery save. + // Give these their own selection key so select_preferred_save_per_stem + // doesn't pick one variant and silently drop the others. + if let Some(ext) = save_path + .extension() + .and_then(|value| value.to_str()) + .map(|value| value.to_ascii_lowercase()) + && matches!(ext.as_str(), "rtc" | "mpk" | "cpk") + { + return format!("{}:{}", stem, ext); + } + stem } fn select_preferred_save_per_stem( @@ -3550,4 +3757,188 @@ mod tests { "dc-line:dreamcast:retroarch:a2" ); } + #[test] + fn save_selection_key_separates_additive_extensions() { + // Pokemon Crystal saves both .srm (battery SRAM) and .rtc (real-time + // clock state). They are NOT alternative formats of each other — + // dropping .rtc loses time-based events (Pokemon evolutions, day/night + // cycle). They must get DIFFERENT selection keys so they don't dedup + // against each other in select_preferred_save_per_stem. + let srm = PathBuf::from( + "/run/media/x/saves/gbc/Pokemon - Crystal Version (USA, Europe) (Rev 1).srm", + ); + let rtc = PathBuf::from( + "/run/media/x/saves/gbc/Pokemon - Crystal Version (USA, Europe) (Rev 1).rtc", + ); + let srm_key = save_selection_key(&srm); + let rtc_key = save_selection_key(&rtc); + assert_ne!( + srm_key, rtc_key, + ".srm and .rtc must have distinct selection keys (issue: Pokemon Crystal clock state was being dropped)" + ); + // Sanity: same stem still produces deterministic per-extension keys. + assert_eq!( + rtc_key, + "pokemon - crystal version (usa, europe) (rev 1):rtc" + ); + } + + #[test] + fn save_selection_key_separates_n64_controller_pak_from_battery_save() { + // N64 .mpk / .cpk = controller pak (separate save device, used for + // games like Mario Kart 64 ghost data, F-Zero X custom courses). + // .eep / .fla / .sra = main battery save. Must not dedup. + let eep = PathBuf::from("/saves/n64/Mario Kart 64 (USA).eep"); + let mpk = PathBuf::from("/saves/n64/Mario Kart 64 (USA).mpk"); + let cpk = PathBuf::from("/saves/n64/Mario Kart 64 (USA).cpk"); + let eep_key = save_selection_key(&eep); + let mpk_key = save_selection_key(&mpk); + let cpk_key = save_selection_key(&cpk); + assert_ne!(eep_key, mpk_key, ".eep and .mpk must have distinct keys"); + assert_ne!(eep_key, cpk_key, ".eep and .cpk must have distinct keys"); + assert_ne!( + mpk_key, cpk_key, + ".mpk and .cpk must have distinct keys (different controller-pak slots)" + ); + } + + #[test] + fn save_selection_key_dedups_real_alternative_formats() { + // Regression guard: ALTERNATIVE formats of the same save (e.g. a + // user with both .sav and .srm versions of the same game) MUST + // still collide so dedup picks the preferred one. + let sav = PathBuf::from("/saves/snes/Super Mario World.sav"); + let srm = PathBuf::from("/saves/snes/Super Mario World.srm"); + assert_eq!( + save_selection_key(&sav), + save_selection_key(&srm), + "true alternative formats (.sav vs .srm of same game) must still dedup" + ); + } + + // ------------------------------------------------------------------ + // Stale sync.lock recovery tests + // + // Covers the regression where the helper bailed on any pre-existing + // sync.lock file, regardless of whether the recorded PID was still + // alive — which left the watcher dead after a reboot mid-sync. + // ------------------------------------------------------------------ + + #[test] + fn parse_lock_pid_reads_pid_line() { + let body = "pid=19199\nstarted_at=2026-06-06T13:29:25Z\n"; + assert_eq!(parse_lock_pid(body), Some(19199)); + } + + #[test] + fn parse_lock_pid_handles_missing_or_garbage() { + assert_eq!(parse_lock_pid(""), None); + assert_eq!(parse_lock_pid("started_at=2026-06-06T13:29:25Z\n"), None); + assert_eq!(parse_lock_pid("pid=not-a-number\n"), None); + // Trailing whitespace on the pid value should still parse. + assert_eq!(parse_lock_pid("pid=42 \nstarted_at=x\n"), Some(42)); + } + + #[test] + fn classify_existing_lock_dead_pid() { + // PID 2^31 - 7 is wildly above any realistic pid_max and is essentially + // guaranteed not to exist on a normal host. We assert pid_alive() agrees. + let definitely_dead_pid: u32 = (i32::MAX as u32) - 7; + assert!( + !pid_alive(definitely_dead_pid), + "test precondition: pid {} should not be alive on this host", + definitely_dead_pid + ); + + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("sync.lock"); + let body = format!( + "pid={}\nstarted_at=2026-06-06T13:29:25Z\n", + definitely_dead_pid + ); + fs::write(&path, &body).unwrap(); + + assert_eq!( + classify_existing_lock(&path, &body), + StaleVerdict::DeadPid(definitely_dead_pid) + ); + } + + #[test] + fn classify_existing_lock_malformed_treated_as_stale() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("sync.lock"); + let body = "this is not what we ever write"; + fs::write(&path, body).unwrap(); + + assert_eq!(classify_existing_lock(&path, body), StaleVerdict::Malformed); + } + + #[test] + fn classify_existing_lock_live_pid_active() { + // Our own PID is alive by definition. + let pid = std::process::id(); + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("sync.lock"); + let body = format!("pid={}\nstarted_at=2026-06-06T13:29:25Z\n", pid); + fs::write(&path, &body).unwrap(); + + // A fresh lockfile owned by a live pid must be Active. + assert_eq!(classify_existing_lock(&path, &body), StaleVerdict::Active); + } + + #[test] + fn acquire_takes_over_dead_pid_lock_and_cleans_up_on_drop() { + let tmp = tempfile::tempdir().unwrap(); + let state_dir = tmp.path(); + let lock_path = state_dir.join("sync.lock"); + + // Pre-stage a stale lockfile from a definitely-not-running PID. + let dead_pid: u32 = (i32::MAX as u32) - 7; + assert!( + !pid_alive(dead_pid), + "precondition: pid {} must not exist", + dead_pid + ); + fs::write( + &lock_path, + format!("pid={}\nstarted_at=2026-06-06T13:29:25Z\n", dead_pid), + ) + .unwrap(); + assert!( + lock_path.exists(), + "precondition: stale lock should be on disk" + ); + + // Acquire must succeed despite the pre-existing file. + let lock = SyncLock::acquire(state_dir).expect("should take over stale lock"); + + // And the file should now contain OUR pid (the takeover replaced it). + let contents = fs::read_to_string(&lock_path).unwrap(); + assert_eq!( + parse_lock_pid(&contents), + Some(std::process::id()), + "after takeover the lockfile should record the current process" + ); + + // Drop releases it (cleans up on disk). + drop(lock); + assert!(!lock_path.exists(), "Drop should remove the lockfile"); + } + + #[test] + fn acquire_refuses_when_lock_is_actively_held_by_live_pid() { + let tmp = tempfile::tempdir().unwrap(); + let state_dir = tmp.path(); + + // Hold a lock with the current (alive) process — second acquire from + // the same process must bail. Drop releases it at end of scope. + let _held = SyncLock::acquire(state_dir).expect("first acquire"); + let err = SyncLock::acquire(state_dir).expect_err("second acquire should bail"); + let msg = format!("{err}"); + assert!( + msg.contains("sync is al actief"), + "expected the 'sync already active' message, got: {msg}" + ); + } } diff --git a/helpers/mister/src/watcher.rs b/helpers/mister/src/watcher.rs index 8c905f1..6fe0a54 100644 --- a/helpers/mister/src/watcher.rs +++ b/helpers/mister/src/watcher.rs @@ -62,7 +62,14 @@ pub fn run_watch( let mut cycles: u32 = 0; while !stop_flag.load(Ordering::SeqCst) { - let report = run_sync( + // Per-cycle errors must NOT exit the watcher. Capture the result + // so we can log + continue, only stop_flag (set by SIGINT/SIGTERM) + // breaks the loop. Surfaced as a real reliability bug 2026-06-06: + // a brief lock-contention race had the watcher exit mid-day with + // `Error: sync is al actief (lockfile bestaat)` and stop syncing + // entirely until manual restart. With the stale-lock takeover from + // the previous PR enabled, races become more likely, not less. + let report_result = run_sync( config, auth, &SyncOptions { @@ -75,25 +82,35 @@ pub fn run_watch( default_source_kind: options.default_source_kind.clone(), }, verbose, - )?; + ); scan_once = false; deep_scan_once = false; apply_scan_once = false; cycles += 1; - if !quiet { - println!( - "Watch cycle {} complete: scanned={} uploaded={} downloaded={} in_sync={} conflicts={} skipped={} errors={}", - cycles, - report.scanned, - report.uploaded, - report.downloaded, - report.in_sync, - report.conflicts, - report.skipped, - report.errors - ); + match report_result { + Ok(report) => { + if !quiet { + println!( + "Watch cycle {} complete: scanned={} uploaded={} downloaded={} in_sync={} conflicts={} skipped={} errors={}", + cycles, + report.scanned, + report.uploaded, + report.downloaded, + report.in_sync, + report.conflicts, + report.skipped, + report.errors + ); + } + } + Err(err) => { + eprintln!( + "waarschuwing: cyclus {} mislukt; doorgaan met volgende cyclus: {err:#}", + cycles + ); + } } if let Some(max_cycles) = options.max_cycles diff --git a/helpers/mister/tests/e2e_smoke.rs b/helpers/mister/tests/e2e_smoke.rs index 1bd8469..0acf5a6 100644 --- a/helpers/mister/tests/e2e_smoke.rs +++ b/helpers/mister/tests/e2e_smoke.rs @@ -286,3 +286,87 @@ fn sync_restores_missing_file_from_cloud_using_saved_adapter_metadata() { assert_eq!(fs::read(&missing_save_path).unwrap(), restored_bytes); assert!(download_mock.calls() >= 1); } + +#[test] +fn watch_loop_survives_per_cycle_sync_error() { + // Regression test for the reliability bug surfaced 2026-06-06: when the + // per-cycle sync invocation returned an Err (e.g. lock-contention bail + // "sync is al actief"), the entire watcher process exited instead of + // logging and moving on. Asserts: + // 1. exit status is success (0) — watcher did not propagate the cycle's + // Err out of the loop. + // 2. stderr contains the new "waarschuwing: cyclus N mislukt" warning + // for cycles 1 AND 2 — proves cycle 1's failure didn't kill the + // process and cycle 2 also ran. + // + // Failure shape we exercise: pre-stage state/sync.lock with the test + // process's own PID (alive by definition), so SyncLock::acquire's + // classify_existing_lock returns Active, and run_sync bails. + use predicates::str::contains; + + let server = MockServer::start(); + server.mock(|when, then| { + when.method(POST).path("/auth/token/app-password"); + then.status(200) + .header("content-type", "application/json") + .body(r#"{"success":true,"token":"tok_watch_err","expiresInDays":7}"#); + }); + server.mock(|when, then| { + when.method(GET).path("/auth/me"); + then.status(200) + .header("content-type", "application/json") + .body(r#"{"success":true,"user":{"email":"watch-err@example.com"}}"#); + }); + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path().join("root"); + let state_dir = tmp.path().join("state"); + fs::create_dir_all(&root).unwrap(); + fs::create_dir_all(&state_dir).unwrap(); + + let config = write_config(&tmp, &server, &root, &state_dir); + + Command::cargo_bin("sgm-mister-helper") + .unwrap() + .arg("--config") + .arg(&config) + .arg("login") + .arg("--email") + .arg("watch-err@example.com") + .arg("--app-password") + .arg("pw") + .assert() + .success(); + + // Pre-stage a lockfile owned by the (live) test process, so SyncLock::acquire + // sees an Active lock and run_sync bails on every cycle. + let live_pid = std::process::id(); + let lock_path = state_dir.join("sync.lock"); + fs::write( + &lock_path, + format!("pid={}\nstarted_at=2026-06-06T22:14:25Z\n", live_pid), + ) + .unwrap(); + assert!( + lock_path.exists(), + "precondition: lock file should be on disk" + ); + + Command::cargo_bin("sgm-mister-helper") + .unwrap() + .arg("--config") + .arg(&config) + .arg("watch") + .arg("--watch-interval") + .arg("1") + .arg("--max-cycles") + .arg("2") + .assert() + .success() // exit 0 — without the fix this would be Err propagation + .stderr(contains("waarschuwing: cyclus 1 mislukt")) + .stderr(contains("waarschuwing: cyclus 2 mislukt")); + + // Lockfile should still be on disk — the watcher couldn't take it over + // (live PID = Active verdict), and our fix means it didn't try to. + assert!(lock_path.exists(), "active lock should still be on disk"); +} diff --git a/helpers/steamdeck/Cargo.toml b/helpers/steamdeck/Cargo.toml index 1f9b890..134f697 100644 --- a/helpers/steamdeck/Cargo.toml +++ b/helpers/steamdeck/Cargo.toml @@ -21,6 +21,11 @@ sha2 = "0.10" time = { version = "0.3", features = ["formatting", "macros"] } walkdir = "2.5" +# Unix-only: PID liveness check (kill(pid, 0)) for stale-lockfile recovery +# in syncer::pid_alive. See helpers/mister/Cargo.toml for the rationale. +[target.'cfg(unix)'.dependencies] +libc = "0.2" + [dev-dependencies] assert_cmd = "2.0" httpmock = "0.8" diff --git a/helpers/steamdeck/src/api.rs b/helpers/steamdeck/src/api.rs index 98d4dd6..140370d 100644 --- a/helpers/steamdeck/src/api.rs +++ b/helpers/steamdeck/src/api.rs @@ -73,6 +73,12 @@ pub struct LatestSaveResponse { pub sha256: Option, pub version: Option, pub id: Option, + /// Bare lowercased on-disk extension of the canonical record (e.g. "srm", + /// "rtc"), from RSM's enriched /latest. Absent on older RSM and on + /// exists=false responses — `None` then, which the download guard treats as + /// "no cross-ext check" (degrades to prior behavior). + #[serde(default)] + pub format: Option, } #[derive(Debug, Clone)] diff --git a/helpers/steamdeck/src/cli.rs b/helpers/steamdeck/src/cli.rs index 10e7254..dc4226a 100644 --- a/helpers/steamdeck/src/cli.rs +++ b/helpers/steamdeck/src/cli.rs @@ -143,6 +143,26 @@ pub enum Commands { #[arg(long = "poll-interval", default_value_t = 5)] poll_interval: u64, }, + /// Black-box decision dump for the save-format verification harness: for each + /// fixture, print classify (system/format/sidecar), upload-slot, the simulated + /// cloud-latest format, the write-guard verdict (allow/skip) and the target. + /// Drives the REAL classify / slot / guard code over real save fixtures + a + /// simulated cloud-latest state — no network, no sync, deterministic. + ExplainSync { + /// Directory of save fixtures. Lay them under system-hinted subdirs + /// (e.g. `gbc/`, `n64/`, `psx/`) — the classifier resolves the system + /// from the path. Also holds `fixtures-manifest.tsv` + /// (`\t` per line) unless --manifest overrides. + #[arg(long = "fixtures-dir")] + fixtures_dir: PathBuf, + /// JSON map keyed `::` -> the /latest object + /// (`{exists, format?, version?, id?}`): the simulated cloud state. + #[arg(long = "cloud-state")] + cloud_state: PathBuf, + /// Manifest path override (default `/fixtures-manifest.tsv`). + #[arg(long)] + manifest: Option, + }, } #[derive(Debug, Subcommand)] diff --git a/helpers/steamdeck/src/config.rs b/helpers/steamdeck/src/config.rs index 34df523..9b435cc 100644 --- a/helpers/steamdeck/src/config.rs +++ b/helpers/steamdeck/src/config.rs @@ -19,6 +19,7 @@ pub struct ConfigOverrides { pub force_upload: Option, pub dry_run: Option, pub route_prefix: Option, + pub secure: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -34,6 +35,7 @@ pub struct AppConfig { pub force_upload: bool, pub dry_run: bool, pub route_prefix: String, + pub secure: bool, pub binary_dir: PathBuf, pub config_path: PathBuf, } @@ -70,7 +72,8 @@ impl LoadedConfig { impl AppConfig { pub fn base_url(&self) -> String { - format!("http://{}:{}", self.url, self.port) + let scheme = if self.secure { "https" } else { "http" }; + format!("{}://{}:{}", scheme, self.url, self.port) } pub fn resolved_root(&self) -> Result { @@ -164,6 +167,7 @@ impl AppConfig { "ROUTE_PREFIX", "", )?); + let secure = choose_bool(overrides.secure, env_values, ini_values, "SECURE", false)?; if watch_interval == 0 { bail!("WATCH_INTERVAL moet >= 1 zijn"); @@ -181,6 +185,7 @@ impl AppConfig { force_upload, dry_run, route_prefix, + secure, binary_dir, config_path: config_path.to_path_buf(), }) @@ -498,6 +503,34 @@ SAVE_PATH="/home/snes9x/save" assert_eq!(cfg.base_url(), "http://192.168.1.9:9096"); } + #[test] + fn base_url_uses_https_when_secure_true() { + let cfg = test_sources( + ConfigOverrides::default(), + &[ + ("URL", "saves.example.com"), + ("PORT", "443"), + ("SECURE", "true"), + ], + &[], + ) + .unwrap(); + assert_eq!(cfg.base_url(), "https://saves.example.com:443"); + assert!(cfg.secure); + } + + #[test] + fn base_url_remains_http_when_secure_unset_for_backwards_compat() { + let cfg = test_sources( + ConfigOverrides::default(), + &[("URL", "192.168.1.9"), ("PORT", "9096")], + &[], + ) + .unwrap(); + assert_eq!(cfg.base_url(), "http://192.168.1.9:9096"); + assert!(!cfg.secure); + } + #[test] fn precedence_cli_over_env_over_ini_over_default() { let cfg = test_sources( diff --git a/helpers/steamdeck/src/lib.rs b/helpers/steamdeck/src/lib.rs index 1f4f179..bda4a46 100644 --- a/helpers/steamdeck/src/lib.rs +++ b/helpers/steamdeck/src/lib.rs @@ -52,6 +52,17 @@ pub fn run() -> Result<()> { bail!("`--verbose` en `--quiet` kunnen niet tegelijk actief zijn"); } + // The verification harness is fully offline (no eGauge config / auth): handle + // it before any config load so it runs anywhere with just the fixtures. + if let Commands::ExplainSync { + fixtures_dir, + cloud_state, + manifest, + } = &cli.command + { + return run_explain_sync(fixtures_dir, cloud_state, manifest.as_deref()); + } + let global_overrides = ConfigOverrides { url: cli.url.clone(), api_url: cli.api_url.clone(), @@ -65,6 +76,7 @@ pub fn run() -> Result<()> { force_upload: None, dry_run: None, route_prefix: cli.route_prefix.clone(), + secure: None, }; let loaded = LoadedConfig::load(cli.config.clone(), &global_overrides)?; @@ -687,11 +699,106 @@ fn dispatch(cli: Cli, loaded: LoadedConfig) -> Result<()> { let cfg = loaded.config.clone(); run_device_auth(&cfg, poll_interval, cli.quiet)?; } + Commands::ExplainSync { .. } => unreachable!("handled offline in run()"), } Ok(()) } +/// Black-box decision dump for the save-format verification harness. For each +/// fixture (named in `/fixtures-manifest.tsv` as +/// `\t`), drive the REAL classify / slot / write-guard +/// code over the file's bytes + a simulated cloud-latest state, and print one +/// tab-separated decision row. No network, no sync — deterministic, so a fixture +/// matrix can assert the bytes never get clobbered or mis-routed. +fn run_explain_sync( + fixtures_dir: &Path, + cloud_state: &Path, + manifest: Option<&Path>, +) -> Result<()> { + use std::collections::BTreeMap; + + let manifest_path = manifest.map_or_else( + || fixtures_dir.join("fixtures-manifest.tsv"), + Path::to_path_buf, + ); + let manifest_text = fs::read_to_string(&manifest_path) + .with_context(|| format!("reading manifest {}", manifest_path.display()))?; + let mut identities: BTreeMap = BTreeMap::new(); + for line in manifest_text.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + let mut parts = line.splitn(2, '\t'); + if let (Some(file), Some(identity)) = (parts.next(), parts.next()) { + identities.insert( + file.trim().to_string(), + identity.trim().to_ascii_lowercase(), + ); + } + } + + let cloud_text = fs::read_to_string(cloud_state) + .with_context(|| format!("reading cloud-state {}", cloud_state.display()))?; + let cloud: serde_json::Value = serde_json::from_str(&cloud_text) + .with_context(|| format!("parsing cloud-state JSON {}", cloud_state.display()))?; + + println!( + "# explain-sync: {} fixture(s) from {}", + identities.len(), + fixtures_dir.display() + ); + println!("file\tsystem\tformat\tsidecar\tupload_slot\tcloud_key\tcloud_format\tguard"); + for (file, identity) in &identities { + let path = fixtures_dir.join(file); + let bytes = match fs::read(&path) { + Ok(bytes) => bytes, + Err(err) => { + println!("{file}\t"); + continue; + } + }; + let Some(classification) = crate::scanner::classify(&path, &bytes) else { + println!("{file}\t\t-\t-\t-\t-\t-\t-"); + continue; + }; + let slot = + crate::syncer::resolve_slot_name_for_sync(&classification.system, &path, "default"); + let key = format!("{identity}::{slot}"); + let entry = cloud.get(&key); + let exists = entry + .and_then(|value| value.get("exists")) + .and_then(serde_json::Value::as_bool) + .unwrap_or(false); + let cloud_format = entry + .and_then(|value| value.get("format")) + .and_then(serde_json::Value::as_str) + .map(str::to_ascii_lowercase); + let guard = match (exists, cloud_format.as_deref()) { + (true, Some(canonical)) => { + if crate::scanner::allow_write(canonical, &classification.format) { + "ALLOW" + } else { + "SKIP" + } + } + _ => "n/a (no canonical)", + }; + println!( + "{file}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + classification.system, + classification.format, + classification.is_sidecar, + slot, + key, + cloud_format.as_deref().unwrap_or("-"), + guard, + ); + } + Ok(()) +} + fn parse_emulator_profile(value: &str) -> Result { EmulatorProfile::parse(value).ok_or_else(|| anyhow!("ongeldig emulatorprofiel '{}'", value)) } diff --git a/helpers/steamdeck/src/scanner.rs b/helpers/steamdeck/src/scanner.rs index 562059a..2b0f952 100644 --- a/helpers/steamdeck/src/scanner.rs +++ b/helpers/steamdeck/src/scanner.rs @@ -545,9 +545,62 @@ pub fn classify_supported_save( rom_path: Option<&Path>, ) -> Option { let save_ext = path_extension(save_path)?; - let save_size = save_path.metadata().ok()?.len(); - if !is_plausible_save_size(save_size) || looks_plain_text(save_path) { + let plain_text = looks_plain_text(save_path); + classify_core(save_path, save_ext, save_size, plain_text, rom_path) +} + +/// Black-box classification for the verification harness: the system slug plus +/// the bare lowercased on-disk `format` (ext) and whether it's a sidecar. Driven +/// from raw bytes (size/content) + the path (name/dir hints) — no filesystem +/// read — so it is table-driveable against real save fixtures. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Classification { + pub system: String, + pub format: String, + pub is_sidecar: bool, +} + +/// Classify a save from its `save_path` (name + directory hints) and its raw +/// `bytes` (size + plain-text screen). Returns `None` for an unrecognized / +/// implausible save. The `format` is the bare lowercased extension and matches +/// RSM's `/latest` `format` field + the on-disk target extension. +pub fn classify(save_path: &Path, bytes: &[u8]) -> Option { + let save_ext = path_extension(save_path)?; + let classified = classify_core( + save_path, + save_ext.clone(), + bytes.len() as u64, + bytes_look_plain_text(bytes), + None, + )?; + Some(Classification { + system: classified.system_slug, + is_sidecar: is_sidecar(&save_ext), + format: save_ext, + }) +} + +fn bytes_look_plain_text(bytes: &[u8]) -> bool { + let sample = &bytes[..bytes.len().min(4096)]; + if sample.is_empty() || sample.contains(&0) { + return false; + } + let printable = sample + .iter() + .filter(|value| matches!(**value, b'\n' | b'\r' | b'\t' | 0x20..=0x7e)) + .count(); + printable.saturating_mul(100) >= sample.len().saturating_mul(95) +} + +fn classify_core( + save_path: &Path, + save_ext: String, + save_size: u64, + plain_text: bool, + rom_path: Option<&Path>, +) -> Option { + if !is_plausible_save_size(save_size) || plain_text { return None; } @@ -657,19 +710,24 @@ pub fn classify_supported_save( &save_lower, &[ "master system", + "mastersystem", "/sms/", "\\sms\\", "game gear", + "gamegear", "/gg/", "\\gg\\", "sega 32x", "sega-32x", "sega32x", + "sega32xjp", + "sega32xna", "/32x/", "\\32x\\", "mega cd", "mega-cd", "megacd", + "megacdjp", "sega cd", "sega-cd", "segacd", @@ -684,11 +742,13 @@ pub fn classify_supported_save( "genesis", "mega drive", "megadrive", + "megadrivejp", "/md/", "\\md\\", "/gen/", "\\gen\\", "saturn", + "saturnjp", "dreamcast", "sega", ], @@ -724,6 +784,54 @@ pub fn classify_supported_save( return None; } + if contains_any( + &save_lower, + &[ + "pc engine", + "pcengine", + "pc-engine", + "pcenginecd", + "pc-engine-cd", + "turbografx", + "turbo grafx", + "turbo-grafx", + "tgfx16", + "/tg16/", + "\\tg16\\", + "/tg-cd/", + "\\tg-cd\\", + "/tg_cd/", + "\\tg_cd\\", + "supergrafx", + "super grafx", + "/sgx/", + "\\sgx\\", + "/pce/", + "\\pce\\", + "/pce-cd/", + "\\pce-cd\\", + "/pcecd/", + "\\pcecd\\", + "nec - pc engine", + "nec - turbografx", + "mednafen-pce", + "beetle pce", + "beetle-pce", + ], + ) { + let slug = infer_nec_slug(&save_lower); + if is_plausible_save_for_system(&save_ext, save_size, slug) { + return classify_if_valid( + save_path, + &save_ext, + save_size, + slug, + format!("path hint nec + .{} ({} bytes)", save_ext, save_size), + ); + } + return None; + } + if contains_any( &save_lower, &[ @@ -848,6 +956,50 @@ fn path_extension(path: &Path) -> Option { .map(|value| value.to_ascii_lowercase()) } +/// Save-format extensions that are SIDECARS — independent companion state that +/// must never slot-collide with, or overwrite, the primary battery save. +/// Authoritative set owned by the save-format spec: +/// rtc — GB/GBC real-time clock (Pokemon Crystal/Gold/Silver, Harvest Moon). +/// mpk / cpk — N64 Controller-Pak / mempak (separate from the .eep/.fla/.sra +/// battery save). +/// Extensible later (e.g. Dreamcast VMU); these three are the in-scope set. +pub const SIDECAR_FORMATS: [&str; 3] = ["rtc", "mpk", "cpk"]; + +/// Whether a bare lowercased extension is a sidecar format. +pub fn is_sidecar(ext: &str) -> bool { + SIDECAR_FORMATS.contains(&ext) +} + +/// Download write-guard: may a cloud record of `canonical_format` be written to +/// a local file of extension `target_ext`? Both are bare lowercased extensions +/// (matching the RSM `/latest` `format` field and the on-disk target ext). +/// +/// If EITHER side is a sidecar, the write is allowed ONLY when the formats are +/// identical — a `.rtc` canonical may land only on a `.rtc` target, never on a +/// `.srm` (and vice versa). If NEITHER side is a sidecar, the write is always +/// allowed: primary↔primary differences are legitimate container conversions +/// (PSX `mcr`→`vmp`, raw `srm`→`sav`) handled elsewhere and must not be blocked. +pub fn allow_write(canonical_format: &str, target_ext: &str) -> bool { + let canonical = canonical_format.to_ascii_lowercase(); + let target = target_ext.to_ascii_lowercase(); + if is_sidecar(&canonical) || is_sidecar(&target) { + return canonical == target; + } + true +} + +/// Human-readable slot LABEL for a sidecar extension (save-format spec). +/// Used to build the RSM-native distinct slot name `" (