From b528030567cec183158ebd5af4dc76a9be77564a Mon Sep 17 00:00:00 2001
From: ConsultingFuture4200 <consultingfutures@gmail.com>
Date: Mon, 15 Jun 2026 11:37:31 -0700
Subject: [PATCH] feat(custom_routers): add FusionGateRouter route-vs-fuse
 meta-router

- Gate each query between single-model routing and OpenRouter openrouter:fusion
  (panel + judge), with a three-tier dial: single / budget_fusion / fusion
- Isolate the beta openrouter:fusion server tool behind FusionExecutor (one
  blast point); graceful judge-failure fallback; per-query dollar cost_ceiling
- Capability-scored panel selection with Quality/Budget preset fallback
- --route-only spend-free preview; 6+ config keys; secret-scrubbed fusion logging
  producing FusionFactory-style training rows; offline retrain step
- Three-arm offline eval harness + fixtures (mock = zero spend); 42 tests
- Zero core edits; one optional provider; local fan-out fallback left as follow-up
---
 custom_routers/fusion_gate/.gitignore         |  15 +
 custom_routers/fusion_gate/PR_BODY.md         |  98 +++
 custom_routers/fusion_gate/README.md          | 176 ++++
 custom_routers/fusion_gate/__init__.py        |  34 +
 custom_routers/fusion_gate/capability.py      | 390 +++++++++
 custom_routers/fusion_gate/config.yaml        |  58 ++
 custom_routers/fusion_gate/eval/RESULTS.md    | 107 +++
 custom_routers/fusion_gate/eval/__init__.py   |  13 +
 .../fusion_gate/eval/eval_harness.py          | 813 ++++++++++++++++++
 .../eval/fixtures/hard_slice.jsonl            |  16 +
 .../eval/fixtures/llm_candidates.json         |  58 ++
 .../eval/fixtures/routing_data.jsonl          |  28 +
 custom_routers/fusion_gate/eval/retrain.py    | 464 ++++++++++
 custom_routers/fusion_gate/executor.py        | 429 +++++++++
 custom_routers/fusion_gate/fusion_log.py      | 204 +++++
 custom_routers/fusion_gate/gate.py            | 328 +++++++
 custom_routers/fusion_gate/router.py          | 270 ++++++
 custom_routers/fusion_gate/tests/__init__.py  |   1 +
 custom_routers/fusion_gate/tests/conftest.py  |  37 +
 .../fusion_gate/tests/test_capability.py      | 197 +++++
 .../fusion_gate/tests/test_eval_harness.py    | 227 +++++
 .../fusion_gate/tests/test_executor.py        | 228 +++++
 .../fusion_gate/tests/test_fusion_log.py      | 209 +++++
 custom_routers/fusion_gate/tests/test_gate.py | 202 +++++
 .../fusion_gate/tests/test_router.py          | 380 ++++++++
 25 files changed, 4982 insertions(+)
 create mode 100644 custom_routers/fusion_gate/.gitignore
 create mode 100644 custom_routers/fusion_gate/PR_BODY.md
 create mode 100644 custom_routers/fusion_gate/README.md
 create mode 100644 custom_routers/fusion_gate/__init__.py
 create mode 100644 custom_routers/fusion_gate/capability.py
 create mode 100644 custom_routers/fusion_gate/config.yaml
 create mode 100644 custom_routers/fusion_gate/eval/RESULTS.md
 create mode 100644 custom_routers/fusion_gate/eval/__init__.py
 create mode 100644 custom_routers/fusion_gate/eval/eval_harness.py
 create mode 100644 custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl
 create mode 100644 custom_routers/fusion_gate/eval/fixtures/llm_candidates.json
 create mode 100644 custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl
 create mode 100644 custom_routers/fusion_gate/eval/retrain.py
 create mode 100644 custom_routers/fusion_gate/executor.py
 create mode 100644 custom_routers/fusion_gate/fusion_log.py
 create mode 100644 custom_routers/fusion_gate/gate.py
 create mode 100644 custom_routers/fusion_gate/router.py
 create mode 100644 custom_routers/fusion_gate/tests/__init__.py
 create mode 100644 custom_routers/fusion_gate/tests/conftest.py
 create mode 100644 custom_routers/fusion_gate/tests/test_capability.py
 create mode 100644 custom_routers/fusion_gate/tests/test_eval_harness.py
 create mode 100644 custom_routers/fusion_gate/tests/test_executor.py
 create mode 100644 custom_routers/fusion_gate/tests/test_fusion_log.py
 create mode 100644 custom_routers/fusion_gate/tests/test_gate.py
 create mode 100644 custom_routers/fusion_gate/tests/test_router.py

diff --git a/custom_routers/fusion_gate/.gitignore b/custom_routers/fusion_gate/.gitignore
new file mode 100644
index 0000000..dcd54e0
--- /dev/null
+++ b/custom_routers/fusion_gate/.gitignore
@@ -0,0 +1,15 @@
+# Compiled Python artifacts must not be tracked. These are build output, not
+# source, and were committed by mistake. To purge ones already tracked:
+#   git rm -r --cached custom_routers/fusion_gate/**/__pycache__
+__pycache__/
+*.pyc
+*.pyo
+
+# Eval harness runtime output. The harness writes results.csv / results.md here
+# on every run; this is build output, not source, and must never be tracked. The
+# committed, intentional report lives at eval/RESULTS.md instead.
+eval/out/
+
+# The repo root .gitignore ignores *.jsonl globally. Re-include the committed
+# eval fixtures, which are source (the offline --mock harness depends on them).
+!eval/fixtures/*.jsonl
diff --git a/custom_routers/fusion_gate/PR_BODY.md b/custom_routers/fusion_gate/PR_BODY.md
new file mode 100644
index 0000000..41c1ea3
--- /dev/null
+++ b/custom_routers/fusion_gate/PR_BODY.md
@@ -0,0 +1,98 @@
+# Add FusionGateRouter — a route-vs-fuse meta-router
+
+## Summary
+
+Adds `FusionGateRouter`, a self-contained custom router plugin under
+`custom_routers/fusion_gate/` that gates each query between the cheap
+single-model path and a multi-model **fusion** path, with fusion delegated to
+OpenRouter's `openrouter:fusion` server tool. **Zero edits to core `llmrouter/`
+code** — the plugin is auto-discovered via the existing `custom_routers/`
+mechanism, exactly like `randomrouter` and `thresholdrouter`.
+
+## Motivation
+
+LLMRouter today picks *which single model* answers a query. The interesting
+lever for hard queries is a different one: **route vs. fuse** — decide whether a
+query is worth running a panel of models and synthesizing their answers. This PR
+makes route-vs-fuse the **primary per-query dial**, expressed as a three-tier
+escalation driven by estimated difficulty:
+
+```
+single  ->  budget_fusion (cheap panel)  ->  fusion (full Quality panel)
+```
+
+Cheap queries stay cheap; only the hard ones escalate, and the middle tier lets
+mid-difficulty queries fuse on a budget panel instead of jumping straight to the
+full Quality panel.
+
+## What's included
+
+**In scope:**
+- `FusionGateRouter` — the route-vs-fuse gate (difficulty + confidence) plus capability-scored panel selection with a Quality/Budget preset fallback.
+- An `openrouter:fusion` adapter (`executor.py`) — the single, isolated blast point for the beta server-tool API.
+- A configurable surface (`threshold`, `budget_threshold`, `k`, `judge`, `provider`/`base_url`, `panel_preset`, `cost_ceiling`, `est_completion_tokens`, `log_sink_path`) and a `--route-only` spend-free preview that returns the decision + intended panel/judge without any API call.
+- A per-query **dollar** cost guard (`cost_ceiling`) that downgrades fusion → single when the projected spend exceeds the cap.
+- Secret-scrubbed fusion-call logging (`fusion_log.py`) producing FusionFactory-style `(query, model, response, performance)` training rows.
+- A three-arm offline eval harness + bundled fixtures (`eval/`) and an offline retrain step.
+- Self-contained: **ONE optional provider** (OpenRouter), **ZERO core edits**.
+
+**Out of scope (follow-ups):**
+- **Local fan-out fallback is OUT of this PR.** Without an OpenRouter key only `--route-only` is exercisable. The executor interface is the seam a provider-agnostic local fan-out path would slot behind later — happy to add it if maintainers want it.
+- A learned gate (the gate currently uses a duck-typed difficulty estimator with a deterministic lexical fallback so it runs with no trained model).
+
+## Eval results
+
+> **All committed numbers are from MOCK fixtures** (deterministic stub executor,
+> zero spend, no network). They validate harness wiring and metric math, **not**
+> real model quality. **Real numbers require a keyed live run**
+> (`OPENROUTER_API_KEY` / `API_KEYS` set) against a real benchmark slice — that
+> path is documented but intentionally not wired into the offline harness so a
+> stray run cannot spend. See `eval/RESULTS.md`.
+
+Dataset: 16 held-out queries (6 easy + 10 hard; GSM8K / MATH / GPQA / MBPP).
+Quality / blended cost / escalation `p` are over the full 16-query dataset; **gate
+precision is computed over the same fixed 10-query hard slice for every arm** so the
+arms are comparable (`always_route` makes no escalation decision → N/A). Slice
+definitions are documented in `eval/RESULTS.md`. Blended cost is an estimated
+**per-query dollar** amount.
+
+| Arm | n | Quality | Blended cost ($/query) | Escalation p | Gate-precision (hard slice) |
+|-----|---|---------|------------------------|--------------|------------------------------|
+| always_route | 16 | 0.3750 | 0.000650 | 0.0000 | n/a |
+| always_fuse | 16 | 1.0000 | 0.001137 | 1.0000 | 1.0000 |
+| fusion_gate | 16 | 1.0000 | 0.000767 | 0.6250 | 1.0000 |
+
+- **Quality target** — gate ≥ 95% of always-fuse quality: 1.0000 vs target 0.9500 → **PASS** (mock).
+- **Cost target** — blended cost ≤ 1.6× always-route: ratio 1.18 → **PASS** (mock).
+- **Gate precision** — escalated answers beating best single, over the hard slice: fusion_gate 10/10, always_fuse 10/10 → **measured** (mock).
+- **Retrain delta** — offline log→retrain holds gate-precision at 1.0000 (threshold refit 0.400 → 0.520, budget_threshold 0.100 → 0.180). **Real delta pending a keyed live run.**
+
+## FusionFactory & continual learning
+
+Each fusion call yields a panel of per-model responses plus a judge synthesis —
+exactly the `(query, model, response, performance)` observations FusionFactory
+needs. `fusion_log.to_training_rows` decomposes them into rows shaped for
+`llmrouter/data/api_calling_evaluation.py`, and the retrain step replays the
+logged sink to refit the gate thresholds offline. This directly serves the
+repo's **continual-learning TODO**: the router's own fusion traffic becomes the
+training signal that sharpens the route-vs-fuse gate over time, with no separate
+labeling pass required.
+
+## Beta server-tool caveat
+
+`openrouter:fusion` is an OpenRouter **BETA** server tool; its request/response
+shape may change. All OpenRouter HTTP specifics are confined to `executor.py`
+(request body, tool type, key resolution, transport, payload parsing), so an
+upstream beta change touches one file. The executor degrades gracefully on judge
+failure (synthesizes from panel responses). No API keys, auth headers, or raw
+provider payloads are ever logged.
+
+## Testing
+
+Torch-free, fully offline (HTTP mocked):
+
+```bash
+pytest custom_routers/fusion_gate/tests/
+python -m custom_routers.fusion_gate.eval.eval_harness --mock --with-retrain \
+  --out custom_routers/fusion_gate/eval/out
+```
diff --git a/custom_routers/fusion_gate/README.md b/custom_routers/fusion_gate/README.md
new file mode 100644
index 0000000..d321bb7
--- /dev/null
+++ b/custom_routers/fusion_gate/README.md
@@ -0,0 +1,176 @@
+# FusionGateRouter
+
+**Type:** Meta-router (route-vs-fuse gate). No training required to run; an optional offline retrain step refits the gate from logged fusion calls.
+
+**Description:** A per-query gate that decides between the cheap **single-model**
+path (classic LLMRouter routing) and a **fusion** path that runs a panel of
+models and synthesizes their answers. Fusion is delegated to the OpenRouter
+`openrouter:fusion` server tool (BETA — see the caveat below). Routing is
+spend-free: the decision is computed locally and only `fuse()` ever calls the
+provider.
+
+The primary per-query dial is **route vs. fuse**, expressed as three tiers:
+
+```
+difficulty < budget_threshold          ->  single         (cheapest single model)
+budget_threshold <= difficulty < threshold  ->  budget_fusion  (cheap Budget panel)
+difficulty >= threshold                ->  fusion         (full Quality panel)
+```
+
+Set `budget_threshold: null` (or `>= threshold`) to disable the middle tier and
+collapse to plain single/fusion. A `high_stakes: true` flag on a query forces
+the full Quality `fusion` tier regardless of difficulty.
+
+## Usage
+
+```bash
+# Inference (routes, then fuses via openrouter:fusion if the gate escalates)
+llmrouter infer --router fusion_gate \
+  --config custom_routers/fusion_gate/config.yaml \
+  --query "Prove that the square root of 2 is irrational."
+
+# Route-only — compute the decision with ZERO spend / no network call
+llmrouter infer --router fusion_gate \
+  --config custom_routers/fusion_gate/config.yaml \
+  --query "What is the capital of France?" \
+  --route-only
+```
+
+`--route-only` returns the decision dict (tier, panel, judge, projected cost)
+without ever calling OpenRouter. Spend happens only when `fuse()` is invoked.
+
+## Decision contract
+
+`route_single` returns one of two shapes (both carry `strategy`, `tier`, and
+`model_name` for drop-in CLI compatibility):
+
+- **single:** `{query, strategy="single", tier="single", model_name, predicted_llm, difficulty, confidence}`
+- **fusion:** `{query, strategy="fusion", tier="budget_fusion"|"fusion", panel[], judge, model_name, predicted_llm, difficulty, confidence, projected_cost}`
+
+When the cost guard fires, a fusion decision is **downgraded** to single and the
+result carries `downgraded_from`, `projected_cost`, and `cost_ceiling`.
+
+## Configuration
+
+All keys live under `hparam:` in `config.yaml` unless noted.
+
+| Key | Default | Purpose |
+|-----|---------|---------|
+| `threshold` | `0.5` | Difficulty cutoff to escalate to the full Quality `fusion` tier. |
+| `budget_threshold` | `0.3` | Lower boundary of the middle `budget_fusion` tier. `null` (or `>= threshold`) disables it. |
+| `k` | `3` | Panel size — maps to the tool's `analysis_models`. |
+| `judge` | `null` | Judge model slug — maps to the tool's `model`. `null` = use the outer model. |
+| `panel_preset` | `Quality` | Fallback preset (`Quality` / `Budget`) when capability data is unavailable for a query. |
+| `cost_ceiling` | `null` | Hard per-query **dollar** cap on the projected `Σ(panel)+judge` cost. `null` = off. See the cost-unit note. |
+| `est_completion_tokens` | `512` | Per-completion output-token estimate feeding the dollar cost projection. |
+| `provider` | `OpenRouter` | Informational; drives credential resolution. |
+| `base_url` | `https://openrouter.ai/api/v1` | OpenRouter endpoint hosting the beta server tool. Overrides the top-level `api_endpoint`. |
+| `log_sink_path` | `null` | JSONL sink for fusion-call logging. `null` = `fusion_log` default (`~/.llmrouter/openclaw_memory.jsonl`). |
+
+Top-level `data_path` / `metric` keys mirror the other custom routers
+(`randomrouter`, `thresholdrouter`); see `config.yaml` for the loaded candidate
+and routing-data paths.
+
+### Cost-unit note (important)
+
+`cost_ceiling` is compared against `project_cost`, which estimates the **per-query
+dollar cost** of the panel + judge. For each member,
+`(input_price · prompt_tokens + output_price · completion_tokens) / 1e6`, where
+`input_price` / `output_price` are the per-million-token prices from `llm_data`,
+`prompt_tokens ≈ len(query) // 4`, and `completion_tokens = est_completion_tokens`
+(default `512`). Set `cost_ceiling` in **dollars per query** (e.g. `0.05` = five
+cents per query).
+
+## Panel selection
+
+Panels are chosen by `CapabilityScorer`, which scores candidates per **query
+category** (code / math / reasoning / general) from the LLMRouter routing-data
+tables, lightly cost-penalized. When no usable capability data exists for a
+query's category, selection falls back to a preset panel resolved by tier:
+`budget_fusion` -> `Budget`, anything else -> the configured `panel_preset`
+(`Quality` by default). The tier->preset mapping (`gate.resolve_preset`) is the
+single source of truth shared with the eval harness.
+
+## OpenRouter `openrouter:fusion` — BETA caveat
+
+The fusion path depends on OpenRouter's `openrouter:fusion` **server tool, which
+is BETA**: its request/response shape may change without notice. To contain that
+risk, **every OpenRouter HTTP specific lives in `executor.py` and nowhere else**
+— request body construction, the `openrouter:fusion` tool type, key resolution,
+transport, and payload parsing. An upstream beta change should touch that one
+file only. The executor also tolerates judge failure (status `ok` with
+`analysis` omitted): it synthesizes the answer from the panel responses rather
+than crashing.
+
+OpenRouter is the **one optional provider**. There is no local fan-out fallback
+(deferred to a follow-up); without a key, only `--route-only` is exercisable.
+
+## Logging
+
+Every `fuse()` call is appended (best-effort, append-only) to the JSONL sink via
+`fusion_log.log_fusion`. The sink is **secret-scrubbed**: API keys, auth
+headers, cookies, and the untouched provider payload are never written; only an
+enumerated set of fields (query, panel, judge, normalized responses, analysis,
+token/cost) is emitted. These rows are the FusionFactory-style training signal
+consumed by the offline retrain step.
+
+## Offline evaluation (`--mock`, zero spend)
+
+The three-arm harness compares `always_route`, `always_fuse`, and `fusion_gate`
+over a bundled 16-query dataset (6 easy + 10 hard; GSM8K / MATH / GPQA / MBPP), with gate
+precision measured on the 10-query hard slice. It is **offline by default** — a deterministic
+stub executor reads canned answers from fixtures; no network call is made and nothing is spent.
+
+```bash
+# Run the offline harness (mock is the default)
+python -m custom_routers.fusion_gate.eval.eval_harness --mock \
+  --out custom_routers/fusion_gate/eval/out
+
+# Include the mock retrain (M3 before/after) delta in results.md
+python -m custom_routers.fusion_gate.eval.eval_harness --mock --with-retrain \
+  --out custom_routers/fusion_gate/eval/out
+```
+
+Tunable flags: `--threshold` (0.5), `--budget-threshold` (0.3), `--k` (2 in the
+harness — kept cost-bounded for the M2 target; the plugin config uses `k=3`),
+`--judge`, `--panel-preset`, `--dataset`, `--llm`, `--routing`, `--out`.
+Outputs: `<out>/results.csv` and `<out>/results.md` (the `--out` dir defaults to
+`eval/out/`, which is **gitignored** — runtime output, not source). The committed,
+intentional report lives at [`eval/RESULTS.md`](eval/RESULTS.md), which also documents
+the full-dataset vs hard-slice definitions used by the metrics.
+
+`--live` is intentionally **not** wired into this harness, so a stray run cannot
+spend; passing it errors out with a pointer to the keyed live-run path.
+
+Run the unit tests (torch-free, fully offline, HTTP mocked):
+
+```bash
+pytest custom_routers/fusion_gate/tests/
+```
+
+## Live run (keyed, real spend)
+
+The committed eval numbers are from MOCK fixtures. To produce real M1–M4 numbers
+you must run keyed against real models:
+
+```bash
+# Provide an OpenRouter key (never commit it):
+export OPENROUTER_API_KEY=sk-...           # or: export API_KEYS='{"OpenRouter": "sk-..."}'
+
+# Then build the real FusionGateRouter from config.yaml and route+fuse a real
+# benchmark slice; the executor makes the openrouter:fusion calls. The offline
+# eval harness does NOT make live calls by design — see eval/RESULTS.md.
+```
+
+Keys are resolved (in order) from an explicit `api_keys={"OpenRouter": "..."}`
+dict, `OPENROUTER_API_KEY`, or an `API_KEYS` JSON env var. Keys are never logged.
+
+## Files
+
+- `router.py` — `FusionGateRouter` entry point (MetaRouter contract).
+- `gate.py` — `RouteGate`, `GateDecision`, the three-tier dial, `resolve_preset`.
+- `capability.py` — `CapabilityScorer` panel selection.
+- `executor.py` — **the only** OpenRouter `openrouter:fusion` blast point.
+- `fusion_log.py` — secret-scrubbed JSONL logging + training-row decomposition.
+- `eval/` — three-arm offline harness, fixtures, retrain, and `RESULTS.md` (the committed report; `eval/out/` is gitignored runtime output).
+- `tests/` — torch-free offline unit tests.
diff --git a/custom_routers/fusion_gate/__init__.py b/custom_routers/fusion_gate/__init__.py
new file mode 100644
index 0000000..09f5c2d
--- /dev/null
+++ b/custom_routers/fusion_gate/__init__.py
@@ -0,0 +1,34 @@
+"""fusion_gate — route-vs-fuse meta-router plugin for LLMRouter.
+
+Auto-discovered from ./custom_routers/ . See router.py for the entry point.
+
+``FusionGateRouter`` is imported LAZILY (PEP 562 ``__getattr__``) rather than
+eagerly: ``router.py`` pulls in torch (MetaRouter subclasses ``nn.Module``), and
+an eager import here would force torch to load whenever this package is merely
+*resolved* — which pytest does for every test module under ``tests/`` while
+walking the package hierarchy. That made the four torch-free test modules
+uncollectable under the standard ``pytest custom_routers/fusion_gate/tests/``
+invocation (ModuleNotFoundError: No module named 'torch'). Deferring the import
+to first attribute access keeps package resolution torch-free while still
+exposing ``FusionGateRouter`` as a top-level name when it is actually used.
+"""
+
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:  # import for type-checkers only; not executed at runtime
+    from .router import FusionGateRouter
+
+__all__ = ["FusionGateRouter"]
+
+
+def __getattr__(name: str) -> Any:
+    """Lazily import ``FusionGateRouter`` on first access (PEP 562).
+
+    torch (a transitive dependency of ``router.py``) is loaded only when the
+    router is actually requested, not at package-collection time.
+    """
+    if name == "FusionGateRouter":
+        from .router import FusionGateRouter
+
+        return FusionGateRouter
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/custom_routers/fusion_gate/capability.py b/custom_routers/fusion_gate/capability.py
new file mode 100644
index 0000000..dbfefbc
--- /dev/null
+++ b/custom_routers/fusion_gate/capability.py
@@ -0,0 +1,390 @@
+"""CapabilityScorer — capability-scored fusion panel selection (UMB-123).
+
+Scores each candidate model against a query and returns the top-k panel that
+maps to the OpenRouter ``openrouter:fusion`` tool's ``analysis_models``. Panel
+membership varies by **query type**: a code/math/reasoning query and a general
+query draw on different per-category performance, so they generally produce
+different panels.
+
+Capability source (offline, no network):
+  - LLMRouter per-model routing performance — the ``routing_data_*`` tables that
+    ``MetaRouter``'s ``DataLoader`` attaches to the router (a pandas DataFrame or
+    a list of row dicts). Each row carries ``task_name`` / ``model_name`` /
+    ``performance``; we bucket ``task_name`` into a small set of query
+    categories and aggregate mean performance per ``(category, model)``.
+  - The ``feature`` text and prices in ``default_llm.json`` provide a deterministic
+    secondary signal (capability prior from model size/feature wording, lightly
+    cost-penalized) so scoring still differentiates models for categories the
+    routing table does not cover.
+
+Fallback contract (UMB-123): when no usable capability data is available for a
+query's category, ``select_panel`` returns ``None`` so the caller falls back to
+the configured ``panel_preset`` (Quality / Budget). The presets are also defined
+here so the router/executor share one source of truth.
+
+The scorer is pure data-in / list-out and imports no torch, keeping it fully
+offline and unit-testable with small in-memory fixtures.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Iterable, Literal
+
+QueryCategory = Literal["code", "math", "reasoning", "general"]
+
+# --- query-type detection (deterministic, documented) ------------------------
+# Mirrors the gate's code/math markers but resolves a single *category* label so
+# panel selection can be category-specific. Order matters: the first matching
+# category wins, with "general" as the catch-all.
+_CODE_KEYWORDS = (
+    "code",
+    "function",
+    "compile",
+    "debug",
+    "regex",
+    "program",
+    "python",
+    "javascript",
+    "bug",
+)
+_CODE_SYMBOLS = ("```", "def ", "class ", "{", "}", ";", "=>", "->")
+_MATH_KEYWORDS = (
+    "integral",
+    "derivative",
+    "theorem",
+    "proof",
+    "equation",
+    "matrix",
+    "algebra",
+    "calculus",
+    "probability",
+    "geometry",
+)
+_MATH_SYMBOLS = ("∫", "∑", "√", "^", "\\")
+_REASONING_KEYWORDS = (
+    "algorithm",
+    "complexity",
+    "reason",
+    "logic",
+    "deduce",
+    "puzzle",
+    "explain why",
+    "step by step",
+    "strategy",
+    "plan",
+)
+
+# Mapping from routing-data ``task_name`` substrings to a query category. The
+# example routing data uses task names like ``agentverse-logicgrid``; this lets
+# the per-category aggregation align with the query-type detector above.
+_TASK_CATEGORY_PATTERNS: tuple[tuple[str, QueryCategory], ...] = (
+    ("logic", "reasoning"),
+    ("reason", "reasoning"),
+    ("grid", "reasoning"),
+    ("puzzle", "reasoning"),
+    ("math", "math"),
+    ("gsm", "math"),
+    ("arithmetic", "math"),
+    ("algebra", "math"),
+    ("code", "code"),
+    ("humaneval", "code"),
+    ("mbpp", "code"),
+    ("program", "code"),
+)
+
+# Built-in presets used as the fallback panel when capability data is missing.
+# These are *labels*, not model names: ``preset_panel`` resolves them by price
+# (cheapest-N for Budget; priciest-N for Quality, price as capability proxy).
+PRESET_QUALITY = "Quality"
+PRESET_BUDGET = "Budget"
+
+
+class CapabilityScorer:
+    """Score candidate models per query and pick a top-k fusion panel.
+
+    Args:
+        llm_data: name -> candidate-metadata mapping (from default_llm.json),
+            carrying ``feature`` text and ``input_price`` / ``output_price``.
+        routing_data: optional per-model performance source — a pandas DataFrame
+            or an iterable of row dicts with ``task_name`` / ``model_name`` /
+            ``performance`` keys. When ``None`` or empty, capability scoring
+            falls back to the static prior derived from ``llm_data``.
+    """
+
+    def __init__(
+        self,
+        llm_data: dict[str, Any],
+        routing_data: Any = None,
+    ):
+        self.llm_data = llm_data
+        self.llm_names = list(llm_data.keys())
+        # category -> {model_name -> mean performance in roughly [0, 1]}
+        self._perf_by_category: dict[QueryCategory, dict[str, float]] = (
+            self._aggregate_performance(routing_data)
+        )
+
+    # ----------------------------------------------------------- public API
+
+    def select_panel(self, query: str, k: int) -> list[str] | None:
+        """Return the capability-scored top-k panel for ``query``.
+
+        The query is classified into a category (code/math/reasoning/general);
+        models are scored for that category and the top ``k`` by score are
+        returned. Returns ``None`` when no usable capability data exists for the
+        category, signalling the caller to fall back to ``panel_preset``.
+
+        Args:
+            query: Raw query text.
+            k: Panel size (maps to the fusion tool's ``analysis_models`` length).
+
+        Returns:
+            A list of up to ``k`` candidate model names, or ``None`` to trigger
+            the preset fallback.
+        """
+        if k <= 0 or not self.llm_names:
+            return None
+
+        category = self.classify_query(query)
+        scores = self._score_models(category)
+        if scores is None:
+            return None
+
+        ranked = sorted(
+            self.llm_names,
+            key=lambda name: (scores.get(name, 0.0), name),
+            reverse=True,
+        )
+        return ranked[:k]
+
+    def preset_panel(self, preset: str, k: int) -> list[str]:
+        """Resolve a named preset (Quality / Budget) to a top-k panel.
+
+        Quality => the ``k`` most-capable candidates (price-as-capability proxy,
+        descending). Budget => the ``k`` cheapest candidates. Any unrecognized
+        preset is treated as Quality. Used as the fallback when capability data
+        is unavailable.
+        """
+        if k <= 0 or not self.llm_names:
+            return []
+
+        by_price_desc = sorted(
+            self.llm_names, key=lambda name: (self._price(name), name), reverse=True
+        )
+        if str(preset).lower() == PRESET_BUDGET.lower():
+            cheapest = sorted(self.llm_names, key=lambda name: (self._price(name), name))
+            return cheapest[:k]
+        return by_price_desc[:k]
+
+    def classify_query(self, query: str) -> QueryCategory:
+        """Classify a query into a coarse capability category.
+
+        Deterministic precedence: code, then math, then reasoning, else general.
+        Kept pure (text in, label out) for unit testing.
+        """
+        if not query:
+            return "general"
+        lowered = query.lower()
+
+        if self._matches(lowered, query, _CODE_KEYWORDS, _CODE_SYMBOLS):
+            return "code"
+        if self._matches(lowered, query, _MATH_KEYWORDS, _MATH_SYMBOLS):
+            return "math"
+        if any(keyword in lowered for keyword in _REASONING_KEYWORDS):
+            return "reasoning"
+        return "general"
+
+    # ----------------------------------------------------------- scoring
+
+    def _score_models(self, category: QueryCategory) -> dict[str, float] | None:
+        """Build a per-model score map for a category, or ``None`` if unusable.
+
+        Combines two signals:
+          1. Empirical per-category performance from the routing data (primary).
+          2. A static prior from ``llm_data`` (feature/size wording, lightly
+             cost-penalized) so models absent from the routing table for this
+             category still rank relative to one another.
+
+        Returns ``None`` only when *neither* signal yields any differentiation
+        (no routing data for the category AND no llm_data prior), which is the
+        fallback trigger for ``select_panel``.
+        """
+        empirical = self._perf_by_category.get(category, {})
+        prior = self._static_prior()
+
+        if not empirical and not prior:
+            return None
+
+        scores: dict[str, float] = {}
+        for name in self.llm_names:
+            emp = empirical.get(name)
+            pri = prior.get(name, 0.0)
+            if emp is not None:
+                # Empirical performance dominates; the prior breaks ties and
+                # ranks models the routing table did not cover for this category.
+                scores[name] = 0.8 * emp + 0.2 * pri
+            else:
+                scores[name] = pri
+        return scores
+
+    def _static_prior(self) -> dict[str, float]:
+        """Capability prior in [0, 1] from llm_data feature text and price.
+
+        Heuristic and deterministic: larger / more-capable wording and higher
+        price correlate with capability in the candidate set, but cost is lightly
+        penalized so two models with similar capability favor the cheaper one.
+        Returns an empty map when ``llm_data`` is empty.
+        """
+        if not self.llm_names:
+            return {}
+
+        prices = [self._price(name) for name in self.llm_names]
+        max_price = max(prices) if prices else 0.0
+
+        prior: dict[str, float] = {}
+        for name in self.llm_names:
+            info = self.llm_data.get(name, {})
+            capability = self._feature_capability(info)
+            price = self._price(name)
+            # Normalize price to [0, 1]; subtract a small cost penalty.
+            norm_price = (price / max_price) if max_price > 0 else 0.0
+            prior[name] = self._clamp(capability - 0.1 * norm_price)
+        return prior
+
+    def _feature_capability(self, info: dict[str, Any]) -> float:
+        """Estimate capability in [0, 1] from a candidate's size/feature text.
+
+        Uses the model ``size`` (parameter count) when parseable, else falls
+        back to capability-suggestive wording in the ``feature`` blurb. Both are
+        normalized into [0, 1]; deterministic and offline.
+        """
+        size_score = self._size_score(info.get("size"))
+        if size_score is not None:
+            return size_score
+
+        feature = str(info.get("feature", "")).lower()
+        strong_markers = (
+            "powerful",
+            "high-accuracy",
+            "exceptional",
+            "advanced",
+            "complex",
+            "large-scale",
+        )
+        hits = sum(1 for marker in strong_markers if marker in feature)
+        return self._clamp(hits / 3.0)
+
+    @staticmethod
+    def _size_score(size: Any) -> float | None:
+        """Parse a parameter-count string (e.g. ``"49B"``) into a [0, 1] score.
+
+        Normalized by a 200B saturation point so the example candidate set
+        (7B..141B) spreads across the range. Returns ``None`` when unparseable.
+        """
+        if size is None:
+            return None
+        match = re.match(r"\s*([\d.]+)\s*([bBmM]?)", str(size))
+        if not match:
+            return None
+        try:
+            value = float(match.group(1))
+        except ValueError:
+            return None
+        unit = match.group(2).lower()
+        billions = value / 1000.0 if unit == "m" else value
+        return CapabilityScorer._clamp(billions / 200.0)
+
+    # ----------------------------------------------------------- aggregation
+
+    def _aggregate_performance(
+        self, routing_data: Any
+    ) -> dict[QueryCategory, dict[str, float]]:
+        """Aggregate mean performance per (category, model) from routing data.
+
+        Accepts a pandas DataFrame or an iterable of row dicts. Rows missing the
+        required keys are skipped. ``task_name`` is bucketed into a query
+        category; ``performance`` values are averaged per (category, model).
+        Returns an empty mapping when no usable rows are present.
+        """
+        rows = self._iter_rows(routing_data)
+        # category -> model -> [running_sum, count]
+        accum: dict[QueryCategory, dict[str, list[float]]] = {}
+
+        for row in rows:
+            model = row.get("model_name")
+            perf = row.get("performance")
+            task = row.get("task_name")
+            if model is None or perf is None:
+                continue
+            try:
+                perf_value = float(perf)
+            except (TypeError, ValueError):
+                continue
+            category = self._task_to_category(task)
+            bucket = accum.setdefault(category, {})
+            entry = bucket.setdefault(str(model), [0.0, 0.0])
+            entry[0] += perf_value
+            entry[1] += 1.0
+
+        result: dict[QueryCategory, dict[str, float]] = {}
+        for category, models in accum.items():
+            result[category] = {
+                name: (total / count) if count else 0.0
+                for name, (total, count) in models.items()
+            }
+        return result
+
+    @staticmethod
+    def _iter_rows(routing_data: Any) -> Iterable[dict[str, Any]]:
+        """Yield row dicts from a DataFrame or an iterable of dicts.
+
+        DataFrames are detected by duck-typing ``to_dict`` (pandas) so this
+        module never imports pandas. Anything else is treated as an iterable of
+        mapping-like rows; non-mappings are ignored.
+        """
+        if routing_data is None:
+            return []
+        # pandas DataFrame: convert to list-of-dicts without importing pandas.
+        if hasattr(routing_data, "to_dict"):
+            try:
+                return routing_data.to_dict(orient="records")
+            except TypeError:
+                return []
+        if isinstance(routing_data, dict):
+            return []
+        try:
+            return [row for row in routing_data if isinstance(row, dict)]
+        except TypeError:
+            return []
+
+    @staticmethod
+    def _task_to_category(task_name: Any) -> QueryCategory:
+        """Bucket a routing-data ``task_name`` into a query category."""
+        if not task_name:
+            return "general"
+        lowered = str(task_name).lower()
+        for pattern, category in _TASK_CATEGORY_PATTERNS:
+            if pattern in lowered:
+                return category
+        return "general"
+
+    # ----------------------------------------------------------- utilities
+
+    def _price(self, name: str) -> float:
+        """Per-model unit price (input + output) from llm_data."""
+        info = self.llm_data.get(name, {})
+        return float(info.get("input_price", 0.0)) + float(info.get("output_price", 0.0))
+
+    @staticmethod
+    def _matches(
+        lowered: str, raw: str, keywords: tuple[str, ...], symbols: tuple[str, ...]
+    ) -> bool:
+        """True when any keyword (lowercased) or raw symbol is present."""
+        if any(keyword in lowered for keyword in keywords):
+            return True
+        return any(symbol in raw for symbol in symbols)
+
+    @staticmethod
+    def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
+        """Clamp ``value`` into [low, high]."""
+        return max(low, min(high, value))
diff --git a/custom_routers/fusion_gate/config.yaml b/custom_routers/fusion_gate/config.yaml
new file mode 100644
index 0000000..4b66814
--- /dev/null
+++ b/custom_routers/fusion_gate/config.yaml
@@ -0,0 +1,58 @@
+# FusionGateRouter config — see fusion-gate-router-prd-v0.2.0.md
+#
+# Plugin path: custom_routers/fusion_gate/config.yaml
+# Use:  llmrouter infer --router fusion_gate --config custom_routers/fusion_gate/config.yaml --query "..."
+# Route-only (no API call / no spend):  ... --route-only
+
+data_path:
+  # LLM candidate metadata (name, service, model, prices, api_endpoint).
+  # OpenRouter models should set "service": "OpenRouter" so API_KEYS resolves the key.
+  llm_data: 'data/example_data/llm_candidates/default_llm.json'
+
+  # Per-model routing performance — the capability source for UMB-123 panel
+  # selection. Loaded by MetaRouter's DataLoader (mirrors randomrouter/config.yaml).
+  query_data_test: 'data/example_data/query_data/default_query_test.jsonl'
+  routing_data_test: 'data/example_data/routing_data/default_routing_test_data.jsonl'
+
+# Metric weights (optional, for evaluation; mirrors randomrouter/config.yaml).
+metric:
+  weights:
+    performance: 1
+    cost: 0
+    llm_judge: 0
+
+hparam:
+  # --- gate (UMB-119) ---
+  threshold: 0.5          # difficulty cutoff to escalate -> full Quality fusion
+
+  # --- three-tier dial (UMB-124) ---
+  # Lower boundary for the middle tier. difficulty in [budget_threshold, threshold)
+  # routes to a cheap Budget fusion panel; >= threshold routes to the full Quality
+  # panel; < budget_threshold routes single. Set to null to disable the mid tier.
+  budget_threshold: 0.3
+
+  # --- panel selection (UMB-123) ---
+  k: 3                    # panel size -> maps to openrouter:fusion `analysis_models`
+  judge: null             # judge slug -> maps to tool `model`; null = outer model
+  panel_preset: 'Quality' # fallback when capability data unavailable: Quality | Budget
+
+  # --- cost guard (UMB-121) ---
+  # Hard per-query DOLLAR cap on the projected Σ(panel)+judge cost; null = off.
+  # e.g. 0.05 = five cents per query. Projection:
+  #   (input_price*prompt_tokens + output_price*completion_tokens)/1e6 per member,
+  # with prompt_tokens ≈ len(query)//4 and completion_tokens = est_completion_tokens.
+  cost_ceiling: null
+  est_completion_tokens: 512  # per-completion output-token estimate for the projection
+
+  # --- provider / endpoint (UMB-121) ---
+  # base_url is the OpenRouter endpoint hosting the beta server tool; provider is
+  # informational (key resolution). base_url overrides the top-level api_endpoint.
+  provider: 'OpenRouter'
+  base_url: 'https://openrouter.ai/api/v1'
+
+  # Optional JSONL sink for fusion-call logging (UMB-125). null = fusion_log default.
+  log_sink_path: null
+
+# OpenRouter endpoint (server tools live here). Per-model endpoints in the
+# candidate JSON override this. Key supplied via API_KEYS '{"OpenRouter": "..."}'.
+api_endpoint: 'https://openrouter.ai/api/v1'
diff --git a/custom_routers/fusion_gate/eval/RESULTS.md b/custom_routers/fusion_gate/eval/RESULTS.md
new file mode 100644
index 0000000..dac3022
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/RESULTS.md
@@ -0,0 +1,107 @@
+# FusionGateRouter — eval harness results
+
+> **These numbers are from MOCK fixtures (deterministic stub executor, zero spend).**
+> They validate the harness wiring and metric math, NOT real model quality.
+> **Real M1–M4 numbers require a keyed live run** (`OPENROUTER_API_KEY` / `API_KEYS`
+> set) against a real benchmark slice — see the *Live run* section below.
+
+This file is the **committed, intentional** eval report. The harness also writes a
+fresh `results.csv` / `results.md` into the gitignored `eval/out/` directory on every
+run; those are runtime output and are never tracked. Regenerate the numbers below
+with:
+
+```bash
+python -m custom_routers.fusion_gate.eval.eval_harness --mock --with-retrain \
+  --out custom_routers/fusion_gate/eval/out
+```
+
+- Source: MOCK fixtures (zero spend)
+- Dataset: `eval/fixtures/hard_slice.jsonl` (16 held-out queries; GSM8K / MATH / GPQA / MBPP)
+
+## Slice definitions
+
+The dataset mixes EASY and HARD queries (6 easy, 10 hard). Two distinct slices are
+used so the metrics are comparable across arms:
+
+- **Full dataset (16 queries)** — drives Quality, Blended cost, and Escalation `p`.
+  Every arm is scored over all 16 records.
+- **Hard slice (10 queries)** — the fixed, arm-independent set used for the **M3
+  gate-precision** metric. A record is *hard* when its `id` carries the `-hard-`
+  marker (e.g. `gsm8k-hard-01`); an explicit `difficulty: "hard"` field overrides
+  the id heuristic when present. See `eval_harness.is_hard_record`.
+
+**Why the hard slice matters for M3 (apples-to-apples):** M3 asks "among escalated
+queries, how often does fusion beat the best single answer?" The `always_fuse` arm
+escalates *every* query (easy + hard) while the `fusion_gate` arm escalates *only the
+hard ones*. Scoring M3 over each arm's own escalation set would give the two arms
+different denominators (16 vs 10) and the numbers would not be comparable. M3 is
+therefore computed over the **same hard slice for every arm**. `always_route` makes
+no escalation decision, so its M3 is **N/A** (undefined).
+
+## Per-arm metrics
+
+Quality / Blended cost / Escalation `p` are over the full 16-query dataset; M3 is over
+the 10-query hard slice.
+
+| Arm | n | Quality | Blended cost ($/query) | Escalation p | Gate-precision (M3, hard slice) |
+|-----|---|---------|------------------------|--------------|---------------------------------|
+| always_route | 16 | 0.3750 | 0.000650 | 0.0000 | n/a |
+| always_fuse  | 16 | 1.0000 | 0.001137 | 1.0000 | 1.0000 |
+| fusion_gate  | 16 | 1.0000 | 0.000767 | 0.6250 | 1.0000 |
+
+Blended cost is an estimated **per-query dollar** amount: for each panel member + judge,
+`(input_price · prompt_tokens + output_price · completion_tokens) / 1e6`, with
+`input_price` / `output_price` the per-million-token prices from `llm_data`,
+`prompt_tokens ≈ len(query) // 4`, and `completion_tokens = est_completion_tokens`
+(default 512). This is the same projection the `cost_ceiling` guard compares against,
+so `cost_ceiling` is set in dollars per query.
+
+## Metric targets
+
+- **M1** — gate quality ≥ 95% of always-fuse quality (hard slice): gate quality 1.0000
+  vs target 0.9500 (95% of always-fuse 1.0000); ratio 1.0000 → **PASS**.
+- **M2** — blended cost ≤ 1.6× always-route: gate cost 0.000767 vs target 0.001039
+  (1.6× always-route 0.000650); ratio 1.1802 → **PASS**.
+- **M3** — gate-precision over the hard slice (escalated answers that beat best single):
+  fusion_gate 1.0000 (10/10), always_fuse 1.0000 (10/10) → **measured** (same slice for
+  both arms; always_route N/A).
+
+## Retrain: gate-precision before vs after
+
+- Source: MOCK fixtures (synthesized fusion log, zero spend)
+- Replayed 16 fusion-log entries → 32 graded training rows.
+- Routing table augmented: 28 → 60 rows.
+- Gate threshold refit: 0.400 → 0.520.
+- Gate budget_threshold refit: 0.100 → 0.180 (raised so wasted low-difficulty
+  escalations route single).
+
+| Metric | Before | After | Delta |
+|--------|--------|-------|-------|
+| M3 gate-precision (hard slice) | 1.0000 | 1.0000 | +0.0000 |
+| Escalated (hard slice) | 10 | 10 | +0 |
+| Escalated-and-improved | 10 | 10 | +0 |
+
+> With M3 scored over the fixed hard slice, the mock retrain holds gate-precision at
+> 1.0000 (it no longer benefits from the prior easy/hard denominator mismatch). The
+> real M3 delta (M4) requires a keyed live run replaying a real fusion-log sink.
+
+## Live run (keyed, real spend)
+
+The committed numbers above are from MOCK fixtures and a deterministic stub executor —
+**zero spend, no network**. To produce real M1–M4 numbers you must run keyed against
+real models:
+
+```bash
+# 1. Provide an OpenRouter key (never commit it):
+export OPENROUTER_API_KEY=sk-...        # or: export API_KEYS='{"OpenRouter": "sk-..."}'
+
+# 2. Build the real router from the plugin config and route+fuse a
+#    real benchmark slice (GSM8K/MATH/GPQA/MBPP), scoring answers with
+#    llmrouter/data/api_calling_evaluation.eval_perf. The real
+#    FusionGateRouter + FusionExecutor make the openrouter:fusion calls;
+#    all OpenRouter HTTP specifics stay inside executor.py.
+#    (This offline harness does NOT make live calls by design.)
+```
+
+M4 (the offline log→retrain quality delta) is produced by `retrain.py`; its mock delta
+is reported above when `--with-retrain` is passed.
diff --git a/custom_routers/fusion_gate/eval/__init__.py b/custom_routers/fusion_gate/eval/__init__.py
new file mode 100644
index 0000000..e963f34
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/__init__.py
@@ -0,0 +1,13 @@
+"""fusion_gate.eval — offline eval + retrain harness for FusionGateRouter.
+
+This package contains the route-vs-fuse evaluation harness (UMB-122/124) and the
+scripted retrain loop (UMB-126). Both run fully offline in ``--mock`` mode against
+the bundled fixtures under ``fixtures/`` and spend nothing; a keyed live-run path
+is documented in ``RESULTS.md`` and in each module's docstring.
+
+Design constraint: nothing here imports torch or pandas. The harness composes the
+torch-free seams of the plugin directly — :class:`RouteGate`, :class:`CapabilityScorer`,
+:class:`FusionExecutor` / a deterministic mock stub, and ``fusion_log`` — mirroring
+what :class:`FusionGateRouter` wires internally, so the harness is importable and
+testable with only the standard library.
+"""
diff --git a/custom_routers/fusion_gate/eval/eval_harness.py b/custom_routers/fusion_gate/eval/eval_harness.py
new file mode 100644
index 0000000..c238668
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/eval_harness.py
@@ -0,0 +1,813 @@
+"""eval_harness — three-arm route-vs-fuse evaluation (UMB-122, UMB-124).
+
+Compares three strategies over a held-out HARD-query slice drawn from the
+LLMRouter benchmark families (GSM8K / MATH / GPQA / MBPP):
+
+  * ``always_route``  — baseline: every query takes the single-model path (the
+                        capability-best single model for the query). One model call.
+  * ``always_fuse``   — every query takes the full Quality fusion panel.
+  * ``fusion_gate``   — the FusionGateRouter decision: gate each query between the
+                        single path and a fusion tier, fusing only the hard ones.
+
+Per arm it captures:
+
+  * quality score  — mean correctness of the chosen answer vs ground truth.
+  * blended cost   — mean projected $ per query (single = one model; fusion =
+                     Σ(panel)+judge, from the executor's ``project_cost``).
+  * escalation rate ``p`` — fraction of queries the arm sent to a fusion tier.
+  * gate-precision (M3, UMB-124) — among ESCALATED queries, the fraction whose
+                     synthesized fusion answer beats the best single-model answer.
+
+Metric targets reported against the baselines:
+
+  * M1: fusion-gate quality >= 95% of always-fuse quality on the hard slice.
+  * M2: fusion-gate blended cost <= 1.6x always-route blended cost.
+  * M3: gate-precision (escalated-and-improved) — reported per UMB-124.
+
+OFFLINE / ZERO-SPEND (``--mock``, the default): a deterministic stub executor
+(:class:`MockFusionExecutor`) reads canned per-model answers from the bundled
+fixture (``fixtures/hard_slice.jsonl``); NO network call is made and nothing is
+spent. The harness composes the plugin's torch-free seams (``RouteGate``,
+``CapabilityScorer``, ``FusionExecutor`` projection, ``fusion_log``) directly,
+mirroring what ``FusionGateRouter`` wires internally — it never imports torch.
+
+LIVE RUN (keyed, real spend — documented, not the default): construct the real
+``FusionGateRouter`` from ``custom_routers/fusion_gate/config.yaml`` and call its
+``route_single`` / ``fuse`` with ``OPENROUTER_API_KEY`` (or ``API_KEYS``) set,
+over a real benchmark slice. See ``RESULTS.md`` and ``--help``. The live path is
+intentionally NOT wired into this offline harness so a stray run cannot spend.
+
+Usage (offline)::
+
+    python -m custom_routers.fusion_gate.eval.eval_harness --mock \
+        --out custom_routers/fusion_gate/eval/out
+
+Outputs: ``<out>/results.csv`` (per-arm rows) and ``<out>/results.md`` (report).
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import importlib.util
+import json
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Iterable
+
+# --- offline, torch-free imports -------------------------------------------
+# Load the plugin's torch-free modules directly by file path so importing this
+# harness never triggers the package __init__ (which imports torch via router.py).
+_PLUGIN_DIR = Path(__file__).resolve().parents[1]
+_FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
+
+
+def _load_module(name: str, filename: str):
+    """Load a sibling plugin module by file path (no package import side effects)."""
+    path = _PLUGIN_DIR / filename
+    spec = importlib.util.spec_from_file_location(name, str(path))
+    if spec is None or spec.loader is None:  # pragma: no cover - defensive
+        raise ImportError(f"cannot load {filename}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before exec so dataclass field types in the module resolve.
+    import sys
+
+    sys.modules[name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+_gate = _load_module("fusion_gate_eval_gate", "gate.py")
+_capability = _load_module("fusion_gate_eval_capability", "capability.py")
+_executor = _load_module("fusion_gate_eval_executor", "executor.py")
+# Re-export the plugin names used below under short module-level aliases.
+RouteGate = _gate.RouteGate
+GateDecision = _gate.GateDecision
+FUSION_TIERS = _gate.FUSION_TIERS
+TIER_TO_PRESET = _gate.TIER_TO_PRESET
+resolve_preset = _gate.resolve_preset
+CapabilityScorer = _capability.CapabilityScorer
+FusionExecutor = _executor.FusionExecutor
+FusionResult = _executor.FusionResult
+
+
+# ---------------------------------------------------------------------------
+# Fixture loading
+# ---------------------------------------------------------------------------
+
+
+def load_jsonl(path: str | Path) -> list[dict[str, Any]]:
+    """Read a JSONL file into a list of dicts (skips blank lines)."""
+    rows: list[dict[str, Any]] = []
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                rows.append(json.loads(line))
+    return rows
+
+
+def load_llm_candidates(path: str | Path) -> dict[str, Any]:
+    """Read the candidate-metadata JSON (default_llm.json shape)."""
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+# ---------------------------------------------------------------------------
+# Deterministic mock executor (zero spend)
+# ---------------------------------------------------------------------------
+
+
+class MockFusionExecutor:
+    """Deterministic, offline stand-in for :class:`FusionExecutor`.
+
+    Mirrors the real executor's ``run`` signature and returns a real
+    :class:`FusionResult`, but instead of an OpenRouter HTTP call it synthesizes
+    the panel ``responses[]`` and the fused ``answer`` from canned per-record
+    fixture data — so the harness exercises the full route→fuse→log flow with
+    ZERO spend and no network. Cost is taken from the real ``project_cost`` so
+    the blended-cost metric stays faithful to the live cost model.
+
+    The mock NEVER touches OpenRouter HTTP specifics; all such logic stays in
+    ``executor.py`` per the plugin's beta-tool isolation rule. This class only
+    fills ``FusionResult`` fields a live call would populate.
+    """
+
+    def __init__(self, llm_data: dict[str, Any], records_by_query: dict[str, dict[str, Any]]):
+        self.llm_data = llm_data
+        self._by_query = records_by_query
+        # Reuse the real projector for faithful cost accounting (no network).
+        self._projector = FusionExecutor(llm_data=llm_data)
+
+    def project_cost(
+        self,
+        panel: list[str],
+        judge: str | None,
+        query: str | None = None,
+        prompt_tokens: int | None = None,
+    ) -> float:
+        """Delegate to the real per-query dollar cost projection (Σ panel + judge)."""
+        return self._projector.project_cost(
+            panel, judge, query=query, prompt_tokens=prompt_tokens
+        )
+
+    def run(
+        self,
+        query: str,
+        panel: list[str],
+        judge: str | None = None,
+        api_keys: dict[str, str] | None = None,
+        **gen_kwargs: Any,
+    ) -> FusionResult:
+        """Synthesize a FusionResult from fixture data — no network, no spend."""
+        record = self._by_query.get(query, {})
+        single_answers: dict[str, str] = record.get("single_answers", {})
+        responses = [
+            {"model": name, "content": single_answers.get(name, "")}
+            for name in panel
+        ]
+        # The fixture carries the judge's synthesized answer for hard queries.
+        fused = record.get("fusion_answer", "")
+        cost = self.project_cost(panel, judge, query=query)
+        return FusionResult(
+            answer=fused,
+            analysis={"consensus": fused, "contradictions": [], "blind_spots": []},
+            responses=responses,
+            panel=list(panel),
+            judge=judge,
+            cost=cost,
+            raw=None,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
+
+
+def normalize_answer(answer: Any) -> str:
+    """Normalize an answer for exact comparison (offline-safe, deterministic).
+
+    Lowercased, stripped, with surrounding whitespace/punctuation removed. Kept
+    intentionally simple: the bundled fixtures use clean canonical answers so a
+    light normalization suffices for the mock metrics. The live path would defer
+    to ``llmrouter/data/api_calling_evaluation.eval_perf`` for benchmark-grade
+    scoring (GSM8K / MATH / code-exec).
+    """
+    text = str(answer).strip().lower()
+    return text.strip(" .$\t\n")
+
+
+def score_answer(prediction: Any, ground_truth: Any) -> float:
+    """Binary correctness in {0.0, 1.0} via normalized exact match."""
+    return 1.0 if normalize_answer(prediction) == normalize_answer(ground_truth) else 0.0
+
+
+def is_hard_record(record: dict[str, Any]) -> bool:
+    """True if a fixture record belongs to the HARD slice.
+
+    The hard slice is the fixed, arm-independent set of records the harness uses
+    for the M3 gate-precision metric, so M3 is computed over the SAME slice for
+    every arm (apples-to-apples). A record is hard when its ``id`` carries the
+    ``-hard-`` marker (e.g. ``gsm8k-hard-01``), with a ``difficulty == "hard"``
+    field honored as an explicit override when present.
+    """
+    explicit = record.get("difficulty")
+    if explicit is not None:
+        return str(explicit).lower() == "hard"
+    return "-hard-" in str(record.get("id", ""))
+
+
+def best_single_answer(record: dict[str, Any]) -> str:
+    """The best single-model answer for a record.
+
+    Prefers the explicit ``single_best_answer`` field; otherwise picks the most
+    common answer across ``single_answers`` (majority vote), ties broken by the
+    answer that matches ground truth when present.
+    """
+    explicit = record.get("single_best_answer")
+    if explicit is not None:
+        return str(explicit)
+    answers = list(record.get("single_answers", {}).values())
+    if not answers:
+        return ""
+    gt = record.get("ground_truth")
+    # Majority vote; prefer a correct answer on ties.
+    counts: dict[str, int] = {}
+    for a in answers:
+        counts[str(a)] = counts.get(str(a), 0) + 1
+    best = max(
+        counts,
+        key=lambda a: (counts[a], 1 if gt is not None and score_answer(a, gt) else 0),
+    )
+    return best
+
+
+# ---------------------------------------------------------------------------
+# Arm results
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ArmResult:
+    """Aggregate metrics for one evaluation arm, plus its per-query detail rows."""
+
+    arm: str                      # arm label, e.g. "always_route"
+    n: int = 0                    # number of dataset records scored
+    quality: float = 0.0          # mean correctness in [0, 1]
+    blended_cost: float = 0.0     # mean projected $ per query
+    escalation_p: float = 0.0     # fraction routed to a fusion tier
+    gate_precision: float | None = None  # M3 (UMB-124): None when undefined
+    n_escalated: int = 0          # escalated count as returned by _gate_precision
+    n_escalated_improved: int = 0  # improved count as returned by _gate_precision
+    per_query: list[dict[str, Any]] = field(default_factory=list)  # one dict per query
+
+
+# ---------------------------------------------------------------------------
+# Harness
+# ---------------------------------------------------------------------------
+
+
+class EvalHarness:
+    """Three-arm route-vs-fuse evaluator over a hard-query slice.
+
+    Args:
+        dataset: list of fixture records (see ``fixtures/hard_slice.jsonl``).
+        llm_data: candidate-metadata mapping (default_llm.json shape).
+        routing_data: optional per-model routing performance rows for the
+            capability scorer (panel selection); list of dicts or None.
+        threshold / budget_threshold / k / judge / panel_preset: gate + panel
+            hyperparameters, mirroring the router's config keys.
+        executor: a run-able executor exposing ``run`` and ``project_cost``. In
+            mock mode this is a :class:`MockFusionExecutor`; a live run passes a
+            keyed :class:`FusionExecutor`.
+    """
+
+    def __init__(
+        self,
+        dataset: list[dict[str, Any]],
+        llm_data: dict[str, Any],
+        executor: Any,
+        routing_data: list[dict[str, Any]] | None = None,
+        threshold: float = 0.5,
+        budget_threshold: float | None = 0.3,
+        k: int = 3,
+        judge: str | None = None,
+        panel_preset: str = "Quality",
+    ):
+        self.dataset = dataset
+        self.llm_data = llm_data
+        self.executor = executor
+        self.k = k
+        self.judge = judge
+        self.panel_preset = panel_preset
+        self.gate = RouteGate(
+            llm_data=llm_data,
+            threshold=threshold,
+            budget_threshold=budget_threshold,
+        )
+        self.capability = CapabilityScorer(llm_data=llm_data, routing_data=routing_data)
+
+    # ----------------------------------------------------------- panel select
+
+    def _select_panel(self, query: str, tier: str) -> list[str]:
+        """Capability-scored top-k panel, preset fallback by tier (UMB-123/124)."""
+        panel = self.capability.select_panel(query, self.k)
+        if panel:
+            return panel
+        # Shared tier->preset resolution (gate.resolve_preset) so the harness and
+        # FusionGateRouter._select_panel cannot diverge.
+        preset = resolve_preset(tier, self.panel_preset)
+        return self.capability.preset_panel(preset, self.k)
+
+    def _quality_preset_panel(self) -> Callable[[str], list[str]]:
+        """Panel selector for the always-fuse arm (always the Quality preset)."""
+
+        def select(query: str) -> list[str]:
+            panel = self.capability.select_panel(query, self.k)
+            if panel:
+                return panel
+            return self.capability.preset_panel("Quality", self.k)
+
+        return select
+
+    # ----------------------------------------------------------- arms
+
+    def _best_single_model(self, query: str) -> str:
+        """The capability-best single model for a query (fair single-router pick).
+
+        Mirrors what a good classic single-model router would choose: the
+        top-1 capability-scored candidate for the query category, falling back to
+        the Quality preset head, then the cheapest model. Used by the
+        ``always_route`` baseline so it is a CAPABLE single-router, not a
+        cheapest-only strawman.
+        """
+        top = self.capability.select_panel(query, 1)
+        if top:
+            return top[0]
+        preset = self.capability.preset_panel("Quality", 1)
+        if preset:
+            return preset[0]
+        return self.gate.cheapest_model()
+
+    def run_always_route(self) -> ArmResult:
+        """Baseline arm: every query → its capability-best single model (one call)."""
+        res = ArmResult(arm="always_route", n=len(self.dataset))
+        total_q = 0.0
+        total_cost = 0.0
+        for record in self.dataset:
+            model = self._best_single_model(record["query"])
+            # Single-model answer for that model; fall back to best single answer.
+            ans = record.get("single_answers", {}).get(model)
+            if ans is None:
+                ans = best_single_answer(record)
+            q = score_answer(ans, record.get("ground_truth"))
+            cost = self.executor.project_cost([model], None, query=record["query"])
+            total_q += q
+            total_cost += cost
+            res.per_query.append(
+                {"id": record.get("id"), "arm": "always_route", "escalated": False,
+                 "model": model, "answer": ans, "quality": q, "cost": cost}
+            )
+        res.quality = total_q / res.n if res.n else 0.0
+        res.blended_cost = total_cost / res.n if res.n else 0.0
+        res.escalation_p = 0.0
+        return res
+
+    def run_always_fuse(self) -> ArmResult:
+        """Always-fuse arm: every query → full Quality fusion panel."""
+        res = ArmResult(arm="always_fuse", n=len(self.dataset))
+        select = self._quality_preset_panel()  # callable: query -> panel
+        total_q = 0.0
+        total_cost = 0.0
+        for record in self.dataset:
+            panel = select(record["query"])
+            result = self.executor.run(record["query"], panel, judge=self.judge)
+            q = score_answer(result.answer, record.get("ground_truth"))
+            cost = result.cost if result.cost is not None else self.executor.project_cost(panel, self.judge, query=record["query"])
+            total_q += q
+            total_cost += cost  # reported cost, or the projection when the executor gave none
+            res.per_query.append(
+                {"id": record.get("id"), "arm": "always_fuse", "escalated": True,
+                 "panel": panel, "answer": result.answer, "quality": q, "cost": cost}
+            )
+        res.quality = total_q / res.n if res.n else 0.0  # mean over the dataset; 0.0 when it is empty
+        res.blended_cost = total_cost / res.n if res.n else 0.0
+        res.escalation_p = 1.0
+        # Every row is stamped escalated; M3 is still scored on hard-slice rows only.
+        res.n_escalated, res.n_escalated_improved, res.gate_precision = self._gate_precision(
+            res.per_query, escalated_only=True
+        )
+        return res
+
+    def run_fusion_gate(self) -> ArmResult:
+        """Fusion-gate arm: gate each query single-vs-fuse, fuse only the hard ones."""
+        res = ArmResult(arm="fusion_gate", n=len(self.dataset))
+        total_q = 0.0
+        total_cost = 0.0
+        escalated = 0  # number of queries sent down the fusion path
+        for record in self.dataset:
+            query = record["query"]
+            decision: GateDecision = self.gate.decide({"query": query})
+            if decision.tier not in FUSION_TIERS:
+                # Single path: use the gate's model (or the cheapest one) and its recorded answer.
+                model = decision.model_name or self.gate.cheapest_model()
+                ans = record.get("single_answers", {}).get(model)
+                if ans is None:  # no recorded answer for that model
+                    ans = best_single_answer(record)
+                q = score_answer(ans, record.get("ground_truth"))
+                cost = self.executor.project_cost([model], None, query=query)
+                total_q += q
+                total_cost += cost
+                res.per_query.append(
+                    {"id": record.get("id"), "arm": "fusion_gate", "tier": decision.tier,
+                     "escalated": False, "model": model, "answer": ans, "quality": q,
+                     "cost": cost}
+                )
+                continue
+
+            # Fusion path: run the tier's panel through the executor with the judge.
+            escalated += 1
+            panel = self._select_panel(query, decision.tier)
+            result = self.executor.run(query, panel, judge=self.judge)
+            q = score_answer(result.answer, record.get("ground_truth"))
+            cost = result.cost if result.cost is not None else self.executor.project_cost(panel, self.judge, query=query)
+            total_q += q
+            total_cost += cost  # reported cost, or the projection when the executor gave none
+            res.per_query.append(
+                {"id": record.get("id"), "arm": "fusion_gate", "tier": decision.tier,
+                 "escalated": True, "panel": panel, "answer": result.answer,
+                 "quality": q, "cost": cost}
+            )
+        res.quality = total_q / res.n if res.n else 0.0  # mean over the dataset; 0.0 when it is empty
+        res.blended_cost = total_cost / res.n if res.n else 0.0
+        res.escalation_p = escalated / res.n if res.n else 0.0  # share of queries that were fused
+        res.n_escalated, res.n_escalated_improved, res.gate_precision = self._gate_precision(
+            res.per_query, escalated_only=True
+        )
+        return res
+
+    # ----------------------------------------------------------- M3 metric
+
+    def _gate_precision(
+        self, per_query: list[dict[str, Any]], escalated_only: bool
+    ) -> tuple[int, int, float | None]:
+        """Gate-precision (M3, UMB-124) — computed over the fixed HARD slice.
+
+        APPLES-TO-APPLES: M3 is scored over the SAME hard slice for every arm
+        (records flagged by ``is_hard_record``), not over each arm's own
+        escalation set. Without this, ``always_fuse`` (which "escalates" every
+        query, easy + hard) and ``fusion_gate`` (which escalates only the hard
+        ones) would compute M3 over different denominators and the numbers would
+        not be comparable.
+
+        Among the hard-slice rows kept (only ``escalated`` rows when
+        ``escalated_only`` is true), M3 is the fraction whose row answer BEATS
+        the best single-model answer — i.e. the row answer scores >= 1.0 AND the
+        best single answer does not. Returns ``(n_escalated,
+        n_escalated_improved, precision)``; precision is ``None`` when no row
+        qualified (e.g. ``always_route``, which makes no escalation decision).
+        """
+        by_id = {r.get("id"): r for r in self.dataset}  # dataset records keyed by their "id"
+        n_esc = 0
+        n_improved = 0
+        for row in per_query:
+            # Each row carries an explicit ``escalated`` bool, so the M3 filter
+            # reads that field directly instead of inferring escalation from the
+            # arm name; rows without the field count as not escalated.
+            if escalated_only and not row.get("escalated", False):
+                continue
+            record = by_id.get(row.get("id"))
+            if record is None:  # row id has no matching dataset record
+                continue
+            # Restrict to the fixed hard slice so the denominator is identical
+            # across arms.
+            if not is_hard_record(record):
+                continue
+            n_esc += 1
+            gt = record.get("ground_truth")
+            fusion_correct = score_answer(row.get("answer"), gt) >= 1.0
+            single_correct = score_answer(best_single_answer(record), gt) >= 1.0
+            # "Beats the best single answer": fusion right where best single wrong.
+            if fusion_correct and not single_correct:
+                n_improved += 1
+        precision = (n_improved / n_esc) if n_esc else None
+        return n_esc, n_improved, precision
+
+    # ----------------------------------------------------------- run all
+
+    def run_all(self) -> dict[str, ArmResult]:
+        """Execute the three arms in order and map each arm name to its result."""
+        results: dict[str, ArmResult] = {}
+        results["always_route"] = self.run_always_route()
+        results["always_fuse"] = self.run_always_fuse()
+        results["fusion_gate"] = self.run_fusion_gate()
+        return results
+
+
+# ---------------------------------------------------------------------------
+# Metric verdicts (M1 / M2 / M3)
+# ---------------------------------------------------------------------------
+
+
+def compute_verdicts(arms: dict[str, ArmResult]) -> dict[str, Any]:
+    """Derive the M1 / M2 / M3 verdict blocks from the three arm results.
+
+    M1 passes when fusion_gate quality is at least 95% of always_fuse quality.
+    M2 passes when fusion_gate blended cost is at most 1.6x always_route cost.
+    M3 reports fusion_gate gate-precision and passes when it is defined and
+    strictly positive. Ratios are ``None`` when their denominator is not > 0.
+    """
+    baseline = arms["always_route"]
+    ceiling = arms["always_fuse"]
+    gated = arms["fusion_gate"]
+    # M1: quality retained relative to the always-fuse arm.
+    quality_floor = 0.95 * ceiling.quality
+    quality_ratio = None
+    if ceiling.quality > 0:
+        quality_ratio = gated.quality / ceiling.quality
+    m1_verdict = {
+        "name": "gate quality >= 95% of always-fuse quality (hard slice)",
+        "gate_quality": gated.quality,
+        "always_fuse_quality": ceiling.quality,
+        "target": quality_floor,
+        "ratio": quality_ratio,
+        "pass": gated.quality >= quality_floor,
+    }
+
+    # M2: blended cost relative to the always-route arm.
+    cost_cap = 1.6 * baseline.blended_cost
+    cost_ratio = None
+    if baseline.blended_cost > 0:
+        cost_ratio = gated.blended_cost / baseline.blended_cost
+    m2_verdict = {
+        "name": "blended cost <= 1.6x always-route",
+        "gate_cost": gated.blended_cost,
+        "always_route_cost": baseline.blended_cost,
+        "target": cost_cap,
+        "ratio": cost_ratio,
+        "pass": gated.blended_cost <= cost_cap,
+    }
+
+    # M3: informational; "pass" only flags a defined, strictly positive value.
+    precision = gated.gate_precision
+    m3_verdict = {
+        "name": "gate-precision: escalated answers that beat best single",
+        "gate_precision": precision,
+        "n_escalated": gated.n_escalated,
+        "n_escalated_improved": gated.n_escalated_improved,
+        "pass": (precision is not None) and (precision > 0.0),
+    }
+    return {"M1": m1_verdict, "M2": m2_verdict, "M3": m3_verdict}
+
+
+# ---------------------------------------------------------------------------
+# Reporting
+# ---------------------------------------------------------------------------
+
+
+def write_results_csv(arms: dict[str, ArmResult], path: str | Path) -> None:
+    """Write one summary row per arm (fixed arm order) to a CSV file at ``path``."""
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    header = ["arm", "n", "quality", "blended_cost", "escalation_p",
+              "gate_precision", "n_escalated", "n_escalated_improved"]
+    with target.open("w", encoding="utf-8", newline="") as handle:
+        out = csv.writer(handle)
+        out.writerow(header)
+        for name in ("always_route", "always_fuse", "fusion_gate"):
+            result = arms[name]
+            precision = result.gate_precision  # None becomes an empty cell
+            precision_cell = "" if precision is None else f"{precision:.4f}"
+            out.writerow(
+                [result.arm, result.n, f"{result.quality:.4f}",
+                 f"{result.blended_cost:.6f}", f"{result.escalation_p:.4f}",
+                 precision_cell, result.n_escalated, result.n_escalated_improved]
+            )
+
+
+def _fmt(value: Any, spec: str = ".4f") -> str:
+    # A missing metric renders as "n/a"; anything else goes through format().
+    rendered = "n/a" if value is None else format(value, spec)
+    return rendered
+
+
+def render_results_md(
+    arms: dict[str, ArmResult],
+    verdicts: dict[str, Any],
+    *,
+    mock: bool,
+    dataset_path: str,
+    n: int,
+    retrain_block: str | None = None,
+) -> str:
+    """Render the human-readable results.md report as one newline-terminated Markdown string."""
+    route = arms["always_route"]
+    fuse = arms["always_fuse"]
+    gate = arms["fusion_gate"]
+
+    source = "MOCK fixtures (zero spend)" if mock else "LIVE keyed run"
+    lines: list[str] = []
+    lines.append("# FusionGateRouter — eval harness results")
+    lines.append("")
+    if mock:
+        lines.append(
+            "> **These numbers are from MOCK fixtures (deterministic stub executor, "
+            "zero spend).** They validate the harness wiring and metric math, NOT "
+            "real model quality. **Real M1–M4 numbers require a keyed live run** "
+            "(`OPENROUTER_API_KEY` / `API_KEYS` set) against a real benchmark slice "
+            "— see the *Live run* section below."
+        )
+    else:
+        lines.append("> Numbers from a LIVE keyed run (real OpenRouter spend).")
+    lines.append("")
+    lines.append(f"- Source: {source}")
+    lines.append(f"- Hard slice: `{dataset_path}` ({n} held-out queries; GSM8K / MATH / GPQA / MBPP)")
+    lines.append("")
+    lines.append("## Per-arm metrics")
+    lines.append("")
+    lines.append("| Arm | n | Quality | Blended cost | Escalation p | Gate-precision (M3) |")
+    lines.append("|-----|---|---------|--------------|--------------|---------------------|")
+    for r in (route, fuse, gate):  # one table row per arm, in this fixed order
+        lines.append(
+            f"| {r.arm} | {r.n} | {_fmt(r.quality)} | {_fmt(r.blended_cost, '.6f')} | "
+            f"{_fmt(r.escalation_p)} | {_fmt(r.gate_precision)} |"
+        )
+    lines.append("")
+    lines.append("## Metric targets")
+    lines.append("")
+    m1 = verdicts["M1"]
+    m2 = verdicts["M2"]
+    m3 = verdicts["M3"]
+    lines.append(
+        f"- **M1** — {m1['name']}: gate quality {_fmt(m1['gate_quality'])} vs "
+        f"target {_fmt(m1['target'])} (95% of always-fuse {_fmt(m1['always_fuse_quality'])}); "
+        f"ratio {_fmt(m1['ratio'])} → **{'PASS' if m1['pass'] else 'FAIL'}**."
+    )
+    lines.append(
+        f"- **M2** — {m2['name']}: gate cost {_fmt(m2['gate_cost'], '.6f')} vs "
+        f"target {_fmt(m2['target'], '.6f')} (1.6x always-route {_fmt(m2['always_route_cost'], '.6f')}); "
+        f"ratio {_fmt(m2['ratio'])} → **{'PASS' if m2['pass'] else 'FAIL'}**."
+    )
+    lines.append(
+        f"- **M3** — {m3['name']}: gate-precision {_fmt(m3['gate_precision'])} "
+        f"({m3['n_escalated_improved']}/{m3['n_escalated']} escalated beat best single) "
+        f"→ **{'measured' if m3['pass'] else 'undefined/none'}**."
+    )
+    lines.append("")
+    if retrain_block:  # optional extra section, inserted verbatim
+        lines.append(retrain_block)
+        lines.append("")
+    lines.append("## Live run (keyed, real spend)")  # NOTE(review): emitted even when mock is False
+    lines.append("")
+    lines.append(
+        "The committed numbers above are from MOCK fixtures and a deterministic "
+        "stub executor — **zero spend, no network**. To produce real M1–M4 "
+        "numbers you must run keyed against real models:"
+    )
+    lines.append("")
+    lines.append("```bash")
+    lines.append("# 1. Provide an OpenRouter key (never commit it):")
+    lines.append("export OPENROUTER_API_KEY=sk-...        # or: export API_KEYS='{\"OpenRouter\": \"sk-...\"}'")
+    lines.append("")
+    lines.append("# 2. Build the real router from the plugin config and route+fuse a")
+    lines.append("#    real benchmark slice (GSM8K/MATH/GPQA/MBPP), scoring answers with")
+    lines.append("#    llmrouter/data/api_calling_evaluation.eval_perf. The real")
+    lines.append("#    FusionGateRouter + FusionExecutor make the openrouter:fusion calls;")
+    lines.append("#    all OpenRouter HTTP specifics stay inside executor.py.")
+    lines.append("#    (This offline harness does NOT make live calls by design.)")
+    lines.append("```")
+    lines.append("")
+    lines.append(
+        "M4 (the offline log→retrain quality delta) is produced by `retrain.py`; "
+        "its mock delta is reported above when `--with-retrain` is passed."
+    )
+    return "\n".join(lines) + "\n"
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def build_mock_harness(
+    *,
+    dataset_path: str | Path,
+    llm_path: str | Path,
+    routing_path: str | Path | None,
+    threshold: float,
+    budget_threshold: float | None,
+    k: int,
+    judge: str | None,
+    panel_preset: str,
+) -> tuple[EvalHarness, list[dict[str, Any]], str]:
+    """Wire an offline harness around a mock executor; return (harness, dataset, dataset_path)."""
+    dataset = load_jsonl(dataset_path)
+    llm_data = load_llm_candidates(llm_path)
+    # Routing data is optional: an unset or non-existent path yields None.
+    routing_data = None
+    if routing_path and Path(routing_path).exists():
+        routing_data = load_jsonl(routing_path)
+    # Index the dataset records by query text for the mock executor.
+    mock_executor = MockFusionExecutor(
+        llm_data=llm_data, records_by_query={rec["query"]: rec for rec in dataset}
+    )
+    harness = EvalHarness(
+        dataset=dataset, llm_data=llm_data, executor=mock_executor,
+        routing_data=routing_data, threshold=threshold,
+        budget_threshold=budget_threshold, k=k, judge=judge,
+        panel_preset=panel_preset,
+    )
+    return harness, dataset, str(dataset_path)
+
+
+def main(argv: list[str] | None = None) -> int:  # CLI entry point; returns the process exit code
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--mock", action="store_true", default=True,
+                        help="Offline mock mode (default; zero spend, no network).")  # already True; only --live flips it
+    # --live is intentionally hidden from --help: this harness is offline-only and
+    # passing it is an immediate error (see the parser.error below). It is kept
+    # (suppressed) so a stray --live yields a clear "live mode not supported"
+    # message rather than an opaque "unrecognized arguments" failure.
+    parser.add_argument("--live", dest="mock", action="store_false",
+                        help=argparse.SUPPRESS)
+    parser.add_argument("--dataset", default=str(_FIXTURES_DIR / "hard_slice.jsonl"),
+                        help="Hard-slice JSONL dataset.")
+    parser.add_argument("--llm", default=str(_FIXTURES_DIR / "llm_candidates.json"),
+                        help="Candidate-metadata JSON (default_llm.json shape).")
+    parser.add_argument("--routing", default=str(_FIXTURES_DIR / "routing_data.jsonl"),
+                        help="Per-model routing performance JSONL (capability source).")
+    parser.add_argument("--out", default=str(Path(__file__).resolve().parent / "out"),
+                        help="Output directory for results.csv and results.md.")
+    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--budget-threshold", type=float, default=0.3)
+    parser.add_argument("--k", type=int, default=2,
+                        help="Fusion panel size. Default 2 keeps the panel cost-bounded "
+                             "so the hard-slice blended cost stays within the M2 target; "
+                             "the plugin config's k=3 trades cost for breadth.")
+    parser.add_argument("--judge", default=None)
+    parser.add_argument("--panel-preset", default="Quality")
+    parser.add_argument("--with-retrain", action="store_true",
+                        help="Append the mock retrain (M3 before/after) delta to results.md.")
+    args = parser.parse_args(argv)
+
+    if not args.mock:  # only reachable via the hidden --live flag
+        parser.error(
+            "Live mode is intentionally not wired into this offline harness so a "
+            "stray run cannot spend. Use the keyed live-run path documented in "
+            "results.md (build the real FusionGateRouter + FusionExecutor)."
+        )
+
+    harness, dataset, dataset_path = build_mock_harness(
+        dataset_path=args.dataset,
+        llm_path=args.llm,
+        routing_path=args.routing,
+        threshold=args.threshold,
+        budget_threshold=args.budget_threshold,
+        k=args.k,
+        judge=args.judge,
+        panel_preset=args.panel_preset,
+    )
+    arms = harness.run_all()
+    verdicts = compute_verdicts(arms)
+
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    write_results_csv(arms, out_dir / "results.csv")
+
+    retrain_block = None  # stays None unless --with-retrain was passed
+    if args.with_retrain:
+        # Lazy import to keep the base harness dependency-light.
+        from . import retrain as _retrain  # type: ignore
+
+        retrain_block = _retrain.mock_retrain_report_block(
+            dataset=dataset,
+            llm_path=args.llm,
+            routing_path=args.routing,
+            k=args.k,
+            judge=args.judge,
+        )
+
+    # Report a portable repo-relative path so the committed results.md is not
+    # tied to one machine's home directory.
+    try:
+        display_path = os.path.relpath(dataset_path, _PLUGIN_DIR.parents[1])
+    except ValueError:  # pragma: no cover - different drive on some platforms
+        display_path = dataset_path
+    md = render_results_md(
+        arms, verdicts, mock=args.mock, dataset_path=display_path,
+        n=len(dataset), retrain_block=retrain_block,
+    )
+    (out_dir / "results.md").write_text(md, encoding="utf-8")
+
+    # Console summary: the two output paths, then one line of metrics per arm.
+    print(f"Wrote {out_dir / 'results.csv'}")
+    print(f"Wrote {out_dir / 'results.md'}")
+    for arm, r in arms.items():
+        print(f"  {arm}: quality={r.quality:.4f} cost={r.blended_cost:.6f} "
+              f"p={r.escalation_p:.4f} gate_precision={_fmt(r.gate_precision)}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # use main()'s return value as the process exit status
diff --git a/custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl b/custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl
new file mode 100644
index 0000000..94c1353
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl
@@ -0,0 +1,16 @@
+{"id": "gsm8k-easy-01", "task_name": "gsm8k", "category": "math", "query": "What is 48 plus 24?", "ground_truth": "72", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "72", "llama-3.1-8b-instruct": "72", "mistral-7b-instruct-v0.3": "72", "llama-3.3-nemotron-super-49b-v1": "72", "llama3-70b-instruct": "72", "mixtral-8x7b-instruct-v0.1": "72", "mixtral-8x22b-instruct-v0.1": "72"}, "single_best_answer": "72", "fusion_answer": "72"}
+{"id": "gsm8k-easy-02", "task_name": "gsm8k", "category": "math", "query": "Half of 6 is what?", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "3", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "3", "llama-3.3-nemotron-super-49b-v1": "3", "llama3-70b-instruct": "3", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "3"}, "single_best_answer": "3", "fusion_answer": "3"}
+{"id": "gpqa-easy-01", "task_name": "gpqa", "category": "reasoning", "query": "Pick the option. Water boils at 100C at sea level. A,B,C,D?", "ground_truth": "C", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "C", "llama-3.1-8b-instruct": "C", "mistral-7b-instruct-v0.3": "C", "llama-3.3-nemotron-super-49b-v1": "C", "llama3-70b-instruct": "C", "mixtral-8x7b-instruct-v0.1": "C", "mixtral-8x22b-instruct-v0.1": "C"}, "single_best_answer": "C", "fusion_answer": "C"}
+{"id": "mbpp-easy-01", "task_name": "mbpp", "category": "code", "query": "Return the length of the list [1,2,3].", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "3", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "3", "llama-3.3-nemotron-super-49b-v1": "3", "llama3-70b-instruct": "3", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "3"}, "single_best_answer": "3", "fusion_answer": "3"}
+{"id": "math-easy-01", "task_name": "math", "category": "math", "query": "Add 2 and 1.", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "3", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "3", "llama-3.3-nemotron-super-49b-v1": "3", "llama3-70b-instruct": "3", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "3"}, "single_best_answer": "3", "fusion_answer": "3"}
+{"id": "gpqa-easy-02", "task_name": "gpqa", "category": "reasoning", "query": "Choose the answer. The sky is blue. A,B,C,D?", "ground_truth": "A", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "A", "llama-3.1-8b-instruct": "A", "mistral-7b-instruct-v0.3": "A", "llama-3.3-nemotron-super-49b-v1": "A", "llama3-70b-instruct": "A", "mixtral-8x7b-instruct-v0.1": "A", "mixtral-8x22b-instruct-v0.1": "A"}, "single_best_answer": "A", "fusion_answer": "A"}
+{"id": "gsm8k-hard-01", "task_name": "gsm8k", "category": "math", "query": "Weng earns $12 an hour for babysitting. Yesterday she did 50 minutes of babysitting. Reason step by step about the unit conversion proof and box the dollar amount earned.", "ground_truth": "10", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "6", "llama-3.1-8b-instruct": "12", "mistral-7b-instruct-v0.3": "9", "llama-3.3-nemotron-super-49b-v1": "8", "llama3-70b-instruct": "11", "mixtral-8x7b-instruct-v0.1": "6", "mixtral-8x22b-instruct-v0.1": "11"}, "single_best_answer": "6", "fusion_answer": "10"}
+{"id": "gsm8k-hard-02", "task_name": "gsm8k", "category": "math", "query": "Betty saves for a $100 wallet, has half, parents give $15, grandparents twice the parents. Reason step by step and box how much more she needs after all contributions.", "ground_truth": "5", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "20", "llama-3.1-8b-instruct": "10", "mistral-7b-instruct-v0.3": "15", "llama-3.3-nemotron-super-49b-v1": "0", "llama3-70b-instruct": "20", "mixtral-8x7b-instruct-v0.1": "15", "mixtral-8x22b-instruct-v0.1": "10"}, "single_best_answer": "15", "fusion_answer": "5"}
+{"id": "math-hard-01", "task_name": "math", "category": "math", "query": "Evaluate the derivative of x^2 at x=3 using the limit-definition theorem; provide a rigorous proof of each algebra step and box the final integer value.", "ground_truth": "6", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "9", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "12", "llama-3.3-nemotron-super-49b-v1": "9", "llama3-70b-instruct": "9", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "9"}, "single_best_answer": "9", "fusion_answer": "6"}
+{"id": "math-hard-02", "task_name": "math", "category": "math", "query": "Compute the probability of rolling a sum of 7 on two dice via the conditional-probability equation; prove the counting complexity step by step and box the simplified fraction.", "ground_truth": "1/6", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "1/8", "llama-3.1-8b-instruct": "5/36", "mistral-7b-instruct-v0.3": "1/12", "llama-3.3-nemotron-super-49b-v1": "1/9", "llama3-70b-instruct": "5/36", "mixtral-8x7b-instruct-v0.1": "1/9", "mixtral-8x22b-instruct-v0.1": "5/36"}, "single_best_answer": "5/36", "fusion_answer": "1/6"}
+{"id": "math-hard-03", "task_name": "math", "category": "math", "query": "Solve for the 2x2 matrix determinant [[2,1],[1,3]] using the cofactor-expansion theorem; prove each algebra step of the equation and box the final integer.", "ground_truth": "5", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "7", "llama-3.1-8b-instruct": "6", "mistral-7b-instruct-v0.3": "1", "llama-3.3-nemotron-super-49b-v1": "7", "llama3-70b-instruct": "6", "mixtral-8x7b-instruct-v0.1": "1", "mixtral-8x22b-instruct-v0.1": "7"}, "single_best_answer": "7", "fusion_answer": "5"}
+{"id": "gpqa-hard-01", "task_name": "gpqa", "category": "reasoning", "query": "Reason step by step about this hard physics question and deduce the correct option explaining the observed spectral line shift; use logic to plan the strategy. Options A,B,C,D. Box the letter.", "ground_truth": "C", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "A", "llama-3.1-8b-instruct": "B", "mistral-7b-instruct-v0.3": "D", "llama-3.3-nemotron-super-49b-v1": "A", "llama3-70b-instruct": "B", "mixtral-8x7b-instruct-v0.1": "A", "mixtral-8x22b-instruct-v0.1": "B"}, "single_best_answer": "A", "fusion_answer": "C"}
+{"id": "gpqa-hard-02", "task_name": "gpqa", "category": "reasoning", "query": "Use step by step logic to deduce the molecular geometry puzzle; the reasoning strategy requires VSEPR. Plan and explain why the answer holds. Options A,B,C,D. Box the letter.", "ground_truth": "B", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "D", "llama-3.1-8b-instruct": "A", "mistral-7b-instruct-v0.3": "A", "llama-3.3-nemotron-super-49b-v1": "C", "llama3-70b-instruct": "A", "mixtral-8x7b-instruct-v0.1": "D", "mixtral-8x22b-instruct-v0.1": "C"}, "single_best_answer": "A", "fusion_answer": "B"}
+{"id": "mbpp-hard-01", "task_name": "mbpp", "category": "code", "query": "Write a python function to debug the regex and count vowels in 'algorithm'; the code must compile. def count_vowels(s): ... return the integer count. Provide the function and result.", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "2", "llama-3.1-8b-instruct": "4", "mistral-7b-instruct-v0.3": "2", "llama-3.3-nemotron-super-49b-v1": "4", "llama3-70b-instruct": "2", "mixtral-8x7b-instruct-v0.1": "4", "mixtral-8x22b-instruct-v0.1": "2"}, "single_best_answer": "2", "fusion_answer": "3"}
+{"id": "mbpp-hard-02", "task_name": "mbpp", "category": "code", "query": "Write a python function to debug this code and return the factorial of 5; the algorithm must compile. def fact(n): ... Provide the function and the final integer result.", "ground_truth": "120", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "60", "llama-3.1-8b-instruct": "24", "mistral-7b-instruct-v0.3": "20", "llama-3.3-nemotron-super-49b-v1": "60", "llama3-70b-instruct": "24", "mixtral-8x7b-instruct-v0.1": "20", "mixtral-8x22b-instruct-v0.1": "60"}, "single_best_answer": "60", "fusion_answer": "120"}
+{"id": "mbpp-hard-03", "task_name": "mbpp", "category": "code", "query": "Write a python function to debug and compute the sum of the algorithm's list [3,7,2,8]; the code must compile. def list_sum(xs): ... Provide the function and the integer result.", "ground_truth": "20", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "18", "llama-3.1-8b-instruct": "19", "mistral-7b-instruct-v0.3": "18", "llama-3.3-nemotron-super-49b-v1": "19", "llama3-70b-instruct": "18", "mixtral-8x7b-instruct-v0.1": "21", "mixtral-8x22b-instruct-v0.1": "19"}, "single_best_answer": "18", "fusion_answer": "20"}
diff --git a/custom_routers/fusion_gate/eval/fixtures/llm_candidates.json b/custom_routers/fusion_gate/eval/fixtures/llm_candidates.json
new file mode 100644
index 0000000..1b0ca9e
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/fixtures/llm_candidates.json
@@ -0,0 +1,58 @@
+{
+  "qwen2.5-7b-instruct": {
+    "size": "7B",
+    "feature": "Fast efficient small model for instruction following.",
+    "input_price": 0.20,
+    "output_price": 0.20,
+    "model": "qwen/qwen2.5-7b-instruct",
+    "service": "OpenRouter"
+  },
+  "llama-3.1-8b-instruct": {
+    "size": "8B",
+    "feature": "Conversational reasoning model with reasonable cost.",
+    "input_price": 0.20,
+    "output_price": 0.20,
+    "model": "meta/llama-3.1-8b-instruct",
+    "service": "OpenRouter"
+  },
+  "mistral-7b-instruct-v0.3": {
+    "size": "7B",
+    "feature": "Fast efficient instruction-following model.",
+    "input_price": 0.20,
+    "output_price": 0.20,
+    "model": "mistralai/mistral-7b-instruct-v0.3",
+    "service": "OpenRouter"
+  },
+  "llama-3.3-nemotron-super-49b-v1": {
+    "size": "49B",
+    "feature": "Powerful high-accuracy model for complex demanding tasks.",
+    "input_price": 0.90,
+    "output_price": 0.90,
+    "model": "nvidia/llama-3.3-nemotron-super-49b-v1",
+    "service": "OpenRouter"
+  },
+  "llama3-70b-instruct": {
+    "size": "70B",
+    "feature": "Powerful large model for comprehensive understanding.",
+    "input_price": 0.90,
+    "output_price": 0.90,
+    "model": "meta/llama3-70b-instruct",
+    "service": "OpenRouter"
+  },
+  "mixtral-8x7b-instruct-v0.1": {
+    "size": "45B",
+    "feature": "Mixture of experts model optimized for creative generation.",
+    "input_price": 0.60,
+    "output_price": 0.60,
+    "model": "mistralai/mixtral-8x7b-instruct-v0.1",
+    "service": "OpenRouter"
+  },
+  "mixtral-8x22b-instruct-v0.1": {
+    "size": "141B",
+    "feature": "Advanced large-scale mixture of experts with exceptional performance.",
+    "input_price": 1.20,
+    "output_price": 1.20,
+    "model": "mistralai/mixtral-8x22b-instruct-v0.1",
+    "service": "OpenRouter"
+  }
+}
diff --git a/custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl b/custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl
new file mode 100644
index 0000000..d6e568a
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl
@@ -0,0 +1,28 @@
+{"task_name": "gsm8k", "model_name": "qwen2.5-7b-instruct", "performance": 0.55}
+{"task_name": "gsm8k", "model_name": "llama-3.1-8b-instruct", "performance": 0.50}
+{"task_name": "gsm8k", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.35}
+{"task_name": "gsm8k", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.78}
+{"task_name": "gsm8k", "model_name": "llama3-70b-instruct", "performance": 0.70}
+{"task_name": "gsm8k", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.45}
+{"task_name": "gsm8k", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.82}
+{"task_name": "math", "model_name": "qwen2.5-7b-instruct", "performance": 0.40}
+{"task_name": "math", "model_name": "llama-3.1-8b-instruct", "performance": 0.45}
+{"task_name": "math", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.30}
+{"task_name": "math", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.72}
+{"task_name": "math", "model_name": "llama3-70b-instruct", "performance": 0.66}
+{"task_name": "math", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.38}
+{"task_name": "math", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.75}
+{"task_name": "gpqa-reasoning", "model_name": "qwen2.5-7b-instruct", "performance": 0.30}
+{"task_name": "gpqa-reasoning", "model_name": "llama-3.1-8b-instruct", "performance": 0.42}
+{"task_name": "gpqa-reasoning", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.25}
+{"task_name": "gpqa-reasoning", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.70}
+{"task_name": "gpqa-reasoning", "model_name": "llama3-70b-instruct", "performance": 0.60}
+{"task_name": "gpqa-reasoning", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.35}
+{"task_name": "gpqa-reasoning", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.74}
+{"task_name": "mbpp-code", "model_name": "qwen2.5-7b-instruct", "performance": 0.48}
+{"task_name": "mbpp-code", "model_name": "llama-3.1-8b-instruct", "performance": 0.58}
+{"task_name": "mbpp-code", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.52}
+{"task_name": "mbpp-code", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.76}
+{"task_name": "mbpp-code", "model_name": "llama3-70b-instruct", "performance": 0.72}
+{"task_name": "mbpp-code", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.50}
+{"task_name": "mbpp-code", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.80}
diff --git a/custom_routers/fusion_gate/eval/retrain.py b/custom_routers/fusion_gate/eval/retrain.py
new file mode 100644
index 0000000..e0e2a42
--- /dev/null
+++ b/custom_routers/fusion_gate/eval/retrain.py
@@ -0,0 +1,464 @@
+"""retrain — scripted, repeatable gate + capability refit from fusion logs (UMB-126).
+
+Closes the loop: logged fusion calls (``fusion_log`` JSONL format, i.e. each line
+``{ts, strategy, query, panel, judge, responses[], analysis, token, cost}``) are
+fed back into the ``api_calling_evaluation`` training-row format, the routing
+table is AUGMENTED with the per-model performance those responses imply, and the
+gate + capability scorer are REFIT on the augmented data. We then re-measure M3
+(gate-precision) BEFORE vs AFTER and report the delta.
+
+Pipeline (offline, deterministic, zero spend):
+
+  1. Load logged fusion responses (``fusion_log`` JSONL). Each ``responses[]``
+     entry is decomposed via the same shape ``fusion_log.to_training_rows``
+     emits: ``{query, model_name, response, performance, ...}``.
+  2. Grade each decomposed response against the hard-slice ground truth
+     (offline exact-match; the live path would use
+     ``api_calling_evaluation.eval_perf``) to fill ``performance``.
+  3. Build routing rows ``{task_name, model_name, performance}`` from the graded
+     responses and AUGMENT the base routing table with them.
+  4. Refit: a fresh :class:`CapabilityScorer` over the augmented routing data
+     (capability scores), and re-tune the gate threshold from the augmented
+     difficulty/quality signal (gate refit).
+  5. Re-run the fusion-gate arm with the refit components and report M3 before
+     vs after as a delta in the report.
+
+``--mock`` path: synthesizes a fusion log from the bundled fixtures (so there is
+something to replay with zero spend) and runs the full before/after measurement.
+Live path: point ``--log`` at a real ``fusion_log`` sink produced by keyed
+``FusionGateRouter.fuse`` calls; same code path, real responses.
+
+This module never imports torch/pandas and makes no network call.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from .eval_harness import (
+    FUSION_TIERS,
+    EvalHarness,
+    MockFusionExecutor,
+    RouteGate,
+    best_single_answer,
+    load_jsonl,
+    load_llm_candidates,
+    score_answer,
+    _FIXTURES_DIR,
+)
+
+
+# ---------------------------------------------------------------------------
+# Step 1–2: replay fusion log into graded training rows
+# ---------------------------------------------------------------------------
+
+
+def synthesize_fusion_log(
+    dataset: list[dict[str, Any]],
+    llm_data: dict[str, Any],
+    routing_data: list[dict[str, Any]] | None,
+    *,
+    k: int,
+    judge: str | None,
+    threshold: float = 0.5,
+    budget_threshold: float | None = 0.3,
+) -> list[dict[str, Any]]:
+    """Replay ``dataset`` through the mock fuse path and collect log entries.
+
+    For each record whose gate decision lands in ``FUSION_TIERS``, the harness
+    picks a panel, the mock executor runs it, and one entry dict (``strategy``,
+    ``query``, ``panel``, ``judge``, ``responses``, ``analysis``, ``token``,
+    ``cost``) is appended; other records are skipped. ``token`` is always None.
+    """
+    mock_executor = MockFusionExecutor(
+        llm_data=llm_data,
+        records_by_query={rec["query"]: rec for rec in dataset},
+    )
+    harness = EvalHarness(
+        dataset=dataset, llm_data=llm_data, executor=mock_executor,
+        routing_data=routing_data, threshold=threshold,
+        budget_threshold=budget_threshold, k=k, judge=judge,
+    )
+    entries: list[dict[str, Any]] = []
+    for rec in dataset:
+        query = rec["query"]
+        tier = harness.gate.decide({"query": query}).tier
+        if tier in FUSION_TIERS:
+            panel = harness._select_panel(query, tier)
+            outcome = mock_executor.run(query, panel, judge=judge)
+            entries.append({
+                "strategy": "fusion",
+                "query": query,
+                "panel": list(outcome.panel),
+                "judge": outcome.judge,
+                "responses": [
+                    {"model": item.get("model"), "content": item.get("content")}
+                    for item in outcome.responses
+                ],
+                "analysis": outcome.analysis,
+                "token": None,
+                "cost": outcome.cost,
+            })
+    return entries
+
+
+def grade_log_to_training_rows(
+    log: list[dict[str, Any]],
+    dataset: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Flatten every logged ``responses[]`` item into one graded training row.
+
+    Each log entry is matched to a dataset record by its ``query``. When that
+    record carries a ``ground_truth``, the response content is scored with
+    ``score_answer``; otherwise ``performance`` stays ``None``. ``task_name`` is
+    copied from the matched record (``None`` when nothing matches).
+    """
+    records = {rec["query"]: rec for rec in dataset}
+    graded: list[dict[str, Any]] = []
+    for entry in log:
+        query = entry.get("query", "")
+        matched = records.get(query, {})
+        truth = matched.get("ground_truth")
+        task = matched.get("task_name")
+        for resp in entry.get("responses", []) or []:
+            model = resp.get("model")
+            content = resp.get("content")
+            perf = None if truth is None else score_answer(content, truth)
+            graded.append(
+                {
+                    "query": query,
+                    "task_name": task,
+                    "model_name": model,
+                    "model": model,
+                    "response": content,
+                    "performance": perf,
+                    "strategy": "fusion",
+                    "judge": entry.get("judge"),
+                }
+            )
+    return graded
+
+
+# ---------------------------------------------------------------------------
+# Step 3: augment the routing table
+# ---------------------------------------------------------------------------
+
+
+def augment_routing_data(
+    base_routing: list[dict[str, Any]] | None,
+    training_rows: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Return a new routing table: the base rows followed by the graded rows.
+
+    Training rows whose ``model_name`` or ``performance`` is ``None`` are left
+    out. Kept rows are reduced to ``task_name`` / ``model_name`` /
+    ``performance``: a falsy task name becomes ``"fusion"`` and performance is
+    cast to ``float``. ``base_routing`` is copied, never mutated; ``None`` is
+    treated as an empty table.
+    """
+    graded_rows = [
+        {
+            "task_name": row.get("task_name") or "fusion",
+            "model_name": row["model_name"],
+            "performance": float(row["performance"]),
+        }
+        for row in training_rows
+        if row.get("model_name") is not None and row.get("performance") is not None
+    ]
+    return [*(base_routing or []), *graded_rows]
+
+
+# ---------------------------------------------------------------------------
+# Step 4: refit the gate threshold
+# ---------------------------------------------------------------------------
+
+
+def refit_gate_thresholds(
+    dataset: list[dict[str, Any]],
+    training_rows: list[dict[str, Any]],
+    *,
+    current_threshold: float,
+    current_budget_threshold: float | None,
+) -> tuple[float, float | None]:
+    """Re-tune the gate thresholds from the augmented quality signal.
+
+    Heuristic, deterministic refit examining the logged fusion responses:
+
+      * Where fusion was logged but the BEST single answer was already correct,
+        the escalation was WASTED — raise the lower (``budget_threshold``) floor
+        so those low-difficulty queries route single next time, lifting M3
+        precision (fewer non-improving escalations).
+      * The upper ``threshold`` is nudged by how reliably fusion BEAT the best
+        single answer overall: more help → lower it (escalate more), less →
+        raise it. Bounded so the refit can never disable the gate.
+
+    Returns ``(threshold, budget_threshold)``; both inputs are returned unchanged
+    when no logged query has a dataset record with ``ground_truth``. The live
+    path could fit a learned ``DifficultyEstimator`` here instead.
+    """
+    gt_by_query = {r["query"]: r for r in dataset}
+    by_query: dict[str, list[dict[str, Any]]] = {}
+    for row in training_rows:
+        by_query.setdefault(row["query"], []).append(row)
+
+    helped = 0
+    wasted = 0
+    total = 0
+    wasted_difficulties: list[float] = []
+    # A throwaway gate purely to score difficulty consistently with the harness.
+    diff_gate = RouteGate(llm_data={"_": {}})
+    for query, rows in by_query.items():
+        record = gt_by_query.get(query)
+        if record is None:
+            continue
+        gt = record.get("ground_truth")
+        if gt is None:
+            continue
+        total += 1
+        fusion_best = max((r.get("performance") or 0.0) for r in rows) if rows else 0.0
+        single_correct = score_answer(best_single_answer(record), gt) >= 1.0
+        if fusion_best >= 1.0 and not single_correct:
+            helped += 1
+        elif single_correct:
+            # Fusion was logged but a single model already had it: wasted spend.
+            wasted += 1
+            wasted_difficulties.append(diff_gate._lexical_difficulty(query))
+
+    if total == 0:
+        return current_threshold, current_budget_threshold
+
+    help_rate = helped / total  # share of graded queries only a panel response got right
+    delta = (0.3 - help_rate) * 0.4  # help_rate above 0.3 lowers the threshold, below raises it
+    threshold = max(0.1, min(0.9, current_threshold + delta))  # clamped to [0.1, 0.9]
+
+    # Raise the budget floor just above the hardest WASTED escalation (capped at
+    # the upper threshold). NOTE(review): a None floor becomes a float here — confirm.
+    budget_threshold = current_budget_threshold
+    if wasted_difficulties:
+        floor = max(wasted_difficulties) + 1e-3
+        base = current_budget_threshold if current_budget_threshold is not None else 0.0
+        budget_threshold = min(threshold, max(base, floor))  # never below base, never above threshold
+
+    return threshold, budget_threshold
+
+
+# ---------------------------------------------------------------------------
+# Step 5: before/after M3 measurement
+# ---------------------------------------------------------------------------
+
+
+def measure_m3(
+    dataset: list[dict[str, Any]],
+    llm_data: dict[str, Any],
+    routing_data: list[dict[str, Any]] | None,
+    *,
+    threshold: float,
+    budget_threshold: float | None,
+    k: int,
+    judge: str | None,
+) -> tuple[float | None, int, int]:
+    """Score the fusion-gate arm on a mock executor: (precision, escalated, improved)."""
+    mock_executor = MockFusionExecutor(
+        llm_data=llm_data,
+        records_by_query={rec["query"]: rec for rec in dataset},
+    )
+    arm = EvalHarness(
+        dataset=dataset, llm_data=llm_data, executor=mock_executor, routing_data=routing_data,
+        threshold=threshold, budget_threshold=budget_threshold, k=k, judge=judge,
+    ).run_fusion_gate()
+    return arm.gate_precision, arm.n_escalated, arm.n_escalated_improved
+
+
+def run_retrain(
+    dataset: list[dict[str, Any]],
+    llm_data: dict[str, Any],
+    base_routing: list[dict[str, Any]] | None,
+    *,
+    log: list[dict[str, Any]],
+    k: int,
+    judge: str | None,
+    threshold: float = 0.5,
+    budget_threshold: float | None = 0.3,
+) -> dict[str, Any]:
+    """Measure M3, refit from ``log``, measure again, and return both snapshots.
+
+    The "before" run uses ``base_routing`` and the given thresholds; the "after"
+    run uses the augmented routing rows and the refit thresholds. ``m3_delta`` is
+    ``None`` whenever either precision is ``None``.
+    """
+    panel_opts = {"k": k, "judge": judge}
+    m3_0, esc_0, imp_0 = measure_m3(
+        dataset, llm_data, base_routing,
+        threshold=threshold, budget_threshold=budget_threshold, **panel_opts,
+    )
+
+    # Grade the replayed log, then derive the refit routing table and thresholds.
+    graded = grade_log_to_training_rows(log, dataset)
+    routing_1 = augment_routing_data(base_routing, graded)
+    threshold_1, budget_1 = refit_gate_thresholds(
+        dataset, graded,
+        current_threshold=threshold, current_budget_threshold=budget_threshold,
+    )
+
+    m3_1, esc_1, imp_1 = measure_m3(
+        dataset, llm_data, routing_1,
+        threshold=threshold_1, budget_threshold=budget_1, **panel_opts,
+    )
+    m3_delta = None if m3_0 is None or m3_1 is None else m3_1 - m3_0
+
+    return {
+        "n_log_entries": len(log),
+        "n_training_rows": len(graded),
+        "n_base_routing_rows": len(base_routing or []),
+        "n_augmented_routing_rows": len(routing_1),
+        "threshold_before": threshold,
+        "threshold_after": threshold_1,
+        "budget_threshold_before": budget_threshold,
+        "budget_threshold_after": budget_1,
+        "m3_before": m3_0,
+        "m3_after": m3_1,
+        "m3_delta": m3_delta,
+        "escalated_before": esc_0,
+        "escalated_after": esc_1,
+        "improved_before": imp_0,
+        "improved_after": imp_1,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Reporting
+# ---------------------------------------------------------------------------
+
+
+def _fmt(value: Any, spec: str = ".4f") -> str:
+    return format(value, spec) if value is not None else "n/a"  # None renders as "n/a"
+
+
+def render_retrain_block(result: dict[str, Any], *, mock: bool) -> str:
+    """Format a retrain result dict as a markdown section (bullets plus M3 table)."""
+    sources = ("LIVE fusion log", "MOCK fixtures (synthesized fusion log, zero spend)")
+    src = sources[bool(mock)]
+    esc_before, esc_after = result["escalated_before"], result["escalated_after"]
+    imp_before, imp_after = result["improved_before"], result["improved_after"]
+    budget_before = _fmt(result.get("budget_threshold_before"), ".3f")
+    budget_after = _fmt(result.get("budget_threshold_after"), ".3f")
+    md = [
+        "## Retrain (UMB-126): M3 before vs after",
+        "",
+        f"- Source: {src}",
+        f"- Replayed {result['n_log_entries']} fusion-log entries → "
+        f"{result['n_training_rows']} graded training rows.",
+        f"- Routing table augmented: {result['n_base_routing_rows']} → "
+        f"{result['n_augmented_routing_rows']} rows.",
+        f"- Gate threshold refit: {_fmt(result['threshold_before'], '.3f')} → "
+        f"{_fmt(result['threshold_after'], '.3f')}.",
+        f"- Gate budget_threshold refit: {budget_before} → {budget_after} "
+        "(raised so wasted low-difficulty escalations route single).",
+        "",
+        "| Metric | Before | After | Delta |",
+        "|--------|--------|-------|-------|",
+        f"| M3 gate-precision | {_fmt(result['m3_before'])} | {_fmt(result['m3_after'])} | "
+        f"{_fmt(result['m3_delta'], '+.4f')} |",
+        f"| Escalated | {esc_before} | {esc_after} | {esc_after - esc_before:+d} |",
+        f"| Escalated-and-improved | {imp_before} | {imp_after} | {imp_after - imp_before:+d} |",
+    ]
+    note = (
+        "> Retrain numbers are from MOCK fixtures; the real M3 delta (M4) "
+        "requires a keyed live run replaying a real fusion-log sink."
+    )
+    return "\n".join(md + ["", note] if mock else md)
+
+
+def mock_retrain_report_block(
+    *,
+    dataset: list[dict[str, Any]],
+    llm_path: str | Path,
+    routing_path: str | Path | None,
+    k: int,
+    judge: str | None,
+) -> str:
+    """Run the retrain loop on a freshly synthesized mock log; return its markdown.
+
+    ``routing_path`` is loaded only when it is set and exists on disk; otherwise
+    the loop runs without a base routing table. The rendered block is always
+    labelled as MOCK.
+    """
+    llm_data = load_llm_candidates(llm_path)
+    base_routing = None
+    if routing_path and Path(routing_path).exists():
+        base_routing = load_jsonl(routing_path)
+
+    # Intentionally loose "before" thresholds: log synthesis and the before-run
+    # share them, so the refit has over-escalation to remove and the offline
+    # report can show a before→after M3 change.
+    loose = {"threshold": 0.4, "budget_threshold": 0.1}
+    log = synthesize_fusion_log(
+        dataset, llm_data, base_routing, k=k, judge=judge, **loose,
+    )
+    result = run_retrain(
+        dataset, llm_data, base_routing, log=log, k=k, judge=judge, **loose,
+    )
+    return render_retrain_block(result, mock=True)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: replay or synthesize a fusion log, write retrain.md/.json, return 0."""
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--mock", action="store_true", default=True,
+                        help="Offline mock mode (default; synthesizes a log from fixtures, zero spend).")
+    parser.add_argument("--dataset", default=str(_FIXTURES_DIR / "hard_slice.jsonl"))
+    parser.add_argument("--llm", default=str(_FIXTURES_DIR / "llm_candidates.json"))
+    parser.add_argument("--routing", default=str(_FIXTURES_DIR / "routing_data.jsonl"))
+    parser.add_argument("--log", default=None,
+                        help="Path to a fusion_log JSONL sink to replay. "
+                             "When omitted in --mock, a log is synthesized from fixtures.")
+    parser.add_argument("--out", default=str(Path(__file__).resolve().parent / "out"))
+    parser.add_argument("--k", type=int, default=2,
+                        help="Fusion panel size (default 2, matching eval_harness).")
+    parser.add_argument("--judge", default=None)
+    # Loose defaults so the offline demo shows an M3 lift; tighten them for a real log replay.
+    parser.add_argument("--threshold", type=float, default=0.4)
+    parser.add_argument("--budget-threshold", type=float, default=0.1)
+    args = parser.parse_args(argv)
+
+    dataset = load_jsonl(args.dataset)
+    llm_data = load_llm_candidates(args.llm)
+    base_routing = load_jsonl(args.routing) if Path(args.routing).exists() else None
+
+    # --mock is always True (store_true with default=True), so the report label is
+    # derived from whether a log file was really read, not from that flag.
+    replayed = bool(args.log) and Path(args.log).exists()
+    if replayed:
+        log = load_jsonl(args.log)
+    else:
+        if args.log:
+            print(f"--log {args.log} not found; synthesizing a mock fusion log instead.")
+        log = synthesize_fusion_log(
+            dataset, llm_data, base_routing, k=args.k, judge=args.judge,
+            threshold=args.threshold, budget_threshold=args.budget_threshold,
+        )
+    result = run_retrain(
+        dataset, llm_data, base_routing, log=log, k=args.k, judge=args.judge,
+        threshold=args.threshold, budget_threshold=args.budget_threshold,
+    )
+    block = render_retrain_block(result, mock=not replayed)
+
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    (out_dir / "retrain.md").write_text(block + "\n", encoding="utf-8")
+    (out_dir / "retrain.json").write_text(json.dumps(result, indent=2), encoding="utf-8")
+    print(f"Wrote {out_dir / 'retrain.md'}")
+    print(f"M3 before={_fmt(result['m3_before'])} after={_fmt(result['m3_after'])} "
+          f"delta={_fmt(result['m3_delta'], '+.4f')}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/custom_routers/fusion_gate/executor.py b/custom_routers/fusion_gate/executor.py
new file mode 100644
index 0000000..5cd98a4
--- /dev/null
+++ b/custom_routers/fusion_gate/executor.py
@@ -0,0 +1,429 @@
+"""FusionExecutor — isolates the OpenRouter `openrouter:fusion` call (UMB-120).
+
+This is the single blast point for the beta server-tool API: all
+OpenRouter-specific request/response handling lives here and nowhere else, so
+upstream changes touch one file. UMB-120 implements `run`; UMB-128 may add a
+local fan-out path behind the same interface.
+
+OpenRouter call shape (for the implementer):
+    POST {api_endpoint or https://openrouter.ai/api/v1}/chat/completions
+    body: {
+      "model": <outer model>,
+      "messages": [{"role": "user", "content": query}],
+      "tools": [{"type": "openrouter:fusion",
+                 "parameters": {"analysis_models": panel, "model": judge}}],
+      "tool_choice": "required"   # gate already decided to fuse
+    }
+Result tool payload: { status, analysis?, responses: [{model, content}, ...] }
+  - judge may fail → status "ok" with `analysis` omitted; fall back to writing
+    the answer from `responses[]`.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass, field
+from typing import Any
+
+DEFAULT_ENDPOINT = "https://openrouter.ai/api/v1"
+
+# Provider key used to resolve the OpenRouter credential from an API_KEYS dict.
+OPENROUTER_PROVIDER = "OpenRouter"
+
+# Default per-completion output-token estimate used by project_cost when no
+# explicit completion-token count is supplied. Overridable via the
+# ``est_completion_tokens`` hparam.
+DEFAULT_EST_COMPLETION_TOKENS = 512
+
+# Roughly four characters per token — the standard heuristic for estimating
+# prompt token count from raw query text.
+_CHARS_PER_TOKEN = 4
+
+# OpenRouter server-tool identifier (BETA). Confined to this module.
+FUSION_TOOL_TYPE = "openrouter:fusion"
+
+
+class CostCeilingExceeded(Exception):
+    """Signals that a fusion call was refused for being too expensive.
+
+    ``projected`` and ``ceiling`` are kept on the instance, and both appear in the
+    message formatted as dollar amounts with six decimals.
+    """
+
+    def __init__(self, projected: float, ceiling: float):
+        message = (
+            f"Projected fusion cost ${projected:.6f} exceeds cost_ceiling "
+            f"${ceiling:.6f} per query; aborting before the OpenRouter call."
+        )
+        super().__init__(message)
+        self.projected = projected
+        self.ceiling = ceiling
+
+
+class FusionExecutorError(Exception):
+    """Raised when no API key can be resolved or the OpenRouter request/decode fails."""
+
+
+@dataclass
+class FusionResult:
+    """Parsed output of one fusion call; every field has an empty/None default.
+
+    answer      : final answer text (``""`` when none could be produced)
+    analysis    : dict with ``consensus`` / ``contradictions`` / ``blind_spots``,
+                  or None when the tool result carried no analysis dict
+    responses   : per-model responses normalized to {"model", "content"} — the
+                  training signal consumed by the log sink (UMB-125)
+    panel       : copy of the panel list the call was made with
+    judge       : judge slug the call was made with (None when unset)
+    cost        : provider-reported cost as a float, or None when not numeric
+    raw         : the untouched provider payload, for debugging
+    """
+
+    answer: str = ""
+    analysis: dict[str, Any] | None = None
+    responses: list[dict[str, Any]] = field(default_factory=list)
+    panel: list[str] = field(default_factory=list)
+    judge: str | None = None
+    cost: float | None = None
+    raw: dict[str, Any] | None = None
+
+
+class FusionExecutor:
+    def __init__(
+        self, llm_data: dict[str, Any], judge: str | None = None,
+        panel_preset: str = "Quality", cost_ceiling: float | None = None,
+        api_endpoint: str | None = None,
+        est_completion_tokens: int = DEFAULT_EST_COMPLETION_TOKENS,
+    ):
+        """Store the fusion configuration on the instance.
+
+        A falsy ``api_endpoint`` falls back to ``DEFAULT_ENDPOINT`` and
+        ``est_completion_tokens`` is coerced with ``int()``.
+        """
+        self.llm_data, self.judge = llm_data, judge
+        self.panel_preset, self.cost_ceiling = panel_preset, cost_ceiling
+        self.api_endpoint = api_endpoint if api_endpoint else DEFAULT_ENDPOINT
+        self.est_completion_tokens = int(est_completion_tokens)
+
+    def run(
+        self,
+        query: str,
+        panel: list[str],
+        judge: str | None = None,
+        api_keys: dict[str, str] | None = None,
+        **gen_kwargs: Any,
+    ) -> FusionResult:
+        """Run one fusion request end to end and return the parsed result.
+
+        Steps, in order:
+          1. pick the judge (``judge`` argument, else ``self.judge``),
+          2. if ``cost_ceiling`` is set, project the cost and abort when it is
+             greater than the ceiling,
+          3. resolve the API key,
+          4. build the request body, POST it, and parse the payload.
+
+        Args:
+            query: User query; sent as the single user message.
+            panel: Panel model slugs handed to the request builder (the tool's
+                ``analysis_models``).
+            judge: Judge slug for this call; when falsy, ``self.judge`` is used.
+            api_keys: Optional provider-to-key mapping for key resolution.
+            **gen_kwargs: Extra generation params offered to the request body
+                (they never replace the fusion-defining keys).
+
+        Returns:
+            The ``FusionResult`` produced by ``_parse_payload``.
+
+        Raises:
+            CostCeilingExceeded: ``cost_ceiling`` is set and the projected cost
+                is greater than it; nothing has been sent at that point.
+            FusionExecutorError: propagated from key resolution or the POST.
+            ValueError: propagated from the request builder for an empty panel.
+        """
+        effective_judge = judge or self.judge
+
+        # The ceiling is checked first so an over-budget call never resolves a key
+        # or reaches the network.
+        ceiling = self.cost_ceiling
+        if ceiling is not None:
+            projected = self.project_cost(panel, effective_judge, query=query)
+            if projected > ceiling:
+                raise CostCeilingExceeded(projected, ceiling)
+
+        key = self._resolve_api_key(api_keys)
+        request = self._build_request_body(query, panel, effective_judge, gen_kwargs)
+        payload = self._post_chat_completions(request, key)
+        return self._parse_payload(payload, panel, effective_judge)
+
+    # ------------------------------------------------------- OpenRouter (BETA)
+    # Everything below this line is OpenRouter-specific request/response handling
+    # and MUST stay confined to this module (the beta server-tool blast point).
+
+    def _resolve_api_key(self, api_keys: dict[str, str] | None) -> str:
+        """Return the first non-empty OpenRouter key found, or raise.
+
+        Sources are tried in this order:
+          1. ``api_keys["OpenRouter"]``,
+          2. the ``OPENROUTER_API_KEY`` env var,
+          3. the ``API_KEYS`` env var decoded as a JSON object.
+
+        Raises:
+            FusionExecutorError: ``API_KEYS`` is not valid JSON, or no source
+                yielded a key. The messages never contain a key value.
+        """
+        explicit = api_keys.get(OPENROUTER_PROVIDER) if api_keys else None
+        if explicit:
+            return explicit
+
+        from_env = os.environ.get("OPENROUTER_API_KEY")
+        if from_env:
+            return from_env
+
+        encoded = os.environ.get("API_KEYS")
+        if encoded:
+            try:
+                decoded = json.loads(encoded)
+            except (ValueError, TypeError) as exc:
+                raise FusionExecutorError(
+                    "API_KEYS env var is not valid JSON; cannot resolve the "
+                    f"{OPENROUTER_PROVIDER} key."
+                ) from exc
+            if isinstance(decoded, dict) and decoded.get(OPENROUTER_PROVIDER):
+                return decoded[OPENROUTER_PROVIDER]
+
+        raise FusionExecutorError(
+            f"No {OPENROUTER_PROVIDER} API key found. Provide api_keys="
+            f'{{"{OPENROUTER_PROVIDER}": "..."}}, set OPENROUTER_API_KEY, or set '
+            "API_KEYS as a JSON object."
+        )
+
+    def _build_request_body(
+        self,
+        query: str,
+        panel: list[str],
+        judge: str | None,
+        gen_kwargs: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Assemble the chat/completions request that carries the fusion tool.
+
+        The outer ``model`` is the judge when one is given, otherwise the first
+        panel slug. The tool's ``parameters`` always list a copy of the panel
+        under ``analysis_models`` and include ``model`` only when a judge is
+        given. ``tool_choice`` is fixed to ``"required"``.
+
+        ``gen_kwargs`` entries are added only for keys not already in the body, so
+        ``model``, ``messages``, ``tools`` and ``tool_choice`` cannot be replaced.
+
+        Raises:
+            ValueError: when ``panel`` is empty.
+        """
+        if not panel:
+            raise ValueError("panel must be non-empty for a fusion call")
+
+        tool_params: dict[str, Any] = {"analysis_models": list(panel)}
+        if judge:
+            tool_params["model"] = judge
+
+        request: dict[str, Any] = {
+            "model": judge or panel[0],
+            "messages": [{"role": "user", "content": query}],
+            "tools": [{"type": FUSION_TOOL_TYPE, "parameters": tool_params}],
+            "tool_choice": "required",
+        }
+        # setdefault leaves the four fusion-defining keys above untouched.
+        for name, value in gen_kwargs.items():
+            request.setdefault(name, value)
+        return request
+
+    def _post_chat_completions(
+        self, body: dict[str, Any], api_key: str
+    ) -> dict[str, Any]:
+        """Send ``body`` to ``{api_endpoint}/chat/completions``; return the decoded JSON.
+
+        ``requests`` is used when it can be imported, otherwise the call is handed
+        to ``_post_urllib``. On the ``requests`` path every exception is re-raised
+        as ``FusionExecutorError`` whose message holds only the exception type
+        name and, when the exception has a ``response``, its HTTP status code.
+        """
+        endpoint = f"{self.api_endpoint}/chat/completions"
+        auth_headers = {
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        }
+
+        try:
+            import requests  # type: ignore
+        except ImportError:
+            return self._post_urllib(endpoint, auth_headers, body)
+
+        try:
+            reply = requests.post(endpoint, headers=auth_headers, json=body, timeout=120)
+            reply.raise_for_status()
+            return reply.json()
+        except Exception as exc:  # noqa: BLE001 - normalize transport/HTTP errors
+            # Keep the message free of the key, headers and body: only the
+            # exception type and an optional status code are reported.
+            http_status = getattr(getattr(exc, "response", None), "status_code", None)
+            suffix = "" if http_status is None else f" (HTTP {http_status})"
+            raise FusionExecutorError(
+                f"OpenRouter fusion request failed: {type(exc).__name__}{suffix}"
+            ) from exc
+
+    def _post_urllib(
+        self, url: str, headers: dict[str, str], body: dict[str, Any]
+    ) -> dict[str, Any]:
+        """stdlib fallback POST; every transport/decode failure becomes FusionExecutorError."""
+        import http.client
+        import urllib.request
+
+        data = json.dumps(body).encode("utf-8")
+        req = urllib.request.Request(url, data=data, headers=headers, method="POST")
+        try:
+            with urllib.request.urlopen(req, timeout=120) as resp:  # noqa: S310
+                raw = resp.read().decode("utf-8")
+            return json.loads(raw)
+        except (OSError, http.client.HTTPException, ValueError) as exc:
+            # OSError covers URLError/HTTPError plus read timeouts and connection
+            # resets that urllib does not wrap; HTTPException covers truncated or
+            # malformed replies. Only HTTPError carries ``.code`` (the status).
+            status = getattr(exc, "code", None)
+            detail = f" (HTTP {status})" if status is not None else ""
+            raise FusionExecutorError(
+                f"OpenRouter fusion request failed: {type(exc).__name__}{detail}"
+            ) from exc
+
+    def _parse_payload(
+        self, payload: dict[str, Any], panel: list[str], judge: str | None
+    ) -> FusionResult:
+        """Convert an OpenRouter fusion payload into a ``FusionResult``.
+
+        The tool result looks like ``{status, analysis?, responses: [...]}``. Dict
+        entries of ``responses[]`` are reduced to ``{"model", "content"}``; other
+        entries are ignored. A dict ``analysis`` is narrowed to its consensus /
+        contradictions / blind_spots keys; anything else is the judge-failure
+        mode, where ``analysis`` is ``None`` and the answer is synthesized.
+        """
+        tool_result = self._extract_tool_result(payload)
+
+        panel_outputs: list[dict[str, Any]] = [
+            {"model": entry.get("model"), "content": entry.get("content", "")}
+            for entry in (tool_result.get("responses", []) or [])
+            if isinstance(entry, dict)
+        ]
+
+        judge_analysis = tool_result.get("analysis")
+        if isinstance(judge_analysis, dict):
+            analysis: dict[str, Any] | None = {
+                key: judge_analysis.get(key)
+                for key in ("consensus", "contradictions", "blind_spots")
+            }
+            # Prefer an explicit answer; otherwise reuse the judge's consensus.
+            answer = tool_result.get("answer", "")
+            if not answer:
+                answer = judge_analysis.get("consensus") or ""
+        else:
+            # Judge-failure mode: analysis omitted. Build the answer from the
+            # panel responses instead of crashing.
+            analysis = None
+            answer = self._synthesize_answer(panel_outputs)
+
+        spend = tool_result.get("cost", payload.get("cost"))
+        cost_value = float(spend) if isinstance(spend, (int, float)) else None
+
+        return FusionResult(
+            answer=answer or "",
+            analysis=analysis,
+            responses=panel_outputs,
+            panel=list(panel),
+            judge=judge,
+            cost=cost_value,
+            raw=payload,
+        )
+
+    def _extract_tool_result(self, payload: dict[str, Any]) -> dict[str, Any]:
+        """Locate the fusion tool result inside the chat/completions payload.
+
+        Accepts a top-level ``{status, responses, ...}`` result or one nested in
+        the first choice's tool_calls; malformed shapes raise FusionExecutorError.
+        """
+        if isinstance(payload, dict) and "responses" in payload:
+            return payload
+
+        choices = payload.get("choices") if isinstance(payload, dict) else None
+        first = choices[0] if isinstance(choices, (list, tuple)) and choices else None
+        message = first.get("message") if isinstance(first, dict) else None
+        tool_calls = message.get("tool_calls") if isinstance(message, dict) else None
+        for call in tool_calls if isinstance(tool_calls, (list, tuple)) else ():
+            if not isinstance(call, dict):
+                continue
+            result = call.get("result")
+            if isinstance(result, dict):
+                return result
+            func = call.get("function")
+            args = func.get("arguments") if isinstance(func, dict) else None
+            if isinstance(args, dict):
+                return args
+            if isinstance(args, str):
+                try:
+                    parsed = json.loads(args)
+                except ValueError:
+                    continue
+                if isinstance(parsed, dict):
+                    return parsed
+
+        raise FusionExecutorError(
+            "OpenRouter fusion payload contained no parseable tool result."
+        )
+
+    @staticmethod
+    def _synthesize_answer(responses: list[dict[str, Any]]) -> str:
+        """Build a fallback answer from panel responses when the judge fails."""
+        # A JSON ``null`` content arrives as None; skip it so the fallback answer
+        # never contains the literal text "None".
+        texts = (
+            str(r["content"]).strip() for r in responses if r.get("content") is not None
+        )
+        return "\n\n".join(text for text in texts if text)
+
+    def project_cost(
+        self,
+        panel: list[str],
+        judge: str | None,
+        query: str | None = None,
+        prompt_tokens: int | None = None,
+    ) -> float:
+        """Project the per-query cost, in DOLLARS, of running panel + judge.
+
+        This is a dollar estimate rather than a relative unit-price score.
+        ``input_price`` / ``output_price`` in ``llm_data`` are treated as
+        per-million-token prices, so each panel member and the judge (when one
+        is given) adds::
+
+            (input_price * prompt_tokens
+             + output_price * completion_tokens) / 1e6
+
+        Prompt tokens come from ``prompt_tokens`` when supplied (floored at 1);
+        otherwise from ``query`` as ``max(1, len(query) // _CHARS_PER_TOKEN)``;
+        otherwise ``est_completion_tokens`` stands in. Completion tokens are
+        always ``est_completion_tokens``. A model missing from ``llm_data``, or
+        missing a price key, is priced at 0.0 for that component.
+
+        ``cost_ceiling`` is therefore expected to be set in dollars per query.
+        """
+        completion_toks = self.est_completion_tokens
+        if prompt_tokens is not None:
+            prompt_toks = max(1, int(prompt_tokens))
+        elif query is not None:
+            prompt_toks = max(1, len(query) // _CHARS_PER_TOKEN)
+        else:
+            prompt_toks = completion_toks
+
+        billed = [*panel, judge] if judge else list(panel)
+        dollars = 0.0
+        for member in billed:
+            pricing = self.llm_data.get(member, {})
+            prompt_part = float(pricing.get("input_price", 0.0)) * prompt_toks
+            completion_part = float(pricing.get("output_price", 0.0)) * completion_toks
+            dollars += (prompt_part + completion_part) / 1e6
+        return dollars
diff --git a/custom_routers/fusion_gate/fusion_log.py b/custom_routers/fusion_gate/fusion_log.py
new file mode 100644
index 0000000..e1c9fbc
--- /dev/null
+++ b/custom_routers/fusion_gate/fusion_log.py
@@ -0,0 +1,204 @@
+"""Fusion log sink — structured JSONL logging for fusion calls (UMB-125).
+
+The fusion path produces a panel of model responses plus a judge synthesis. That
+output is the training signal for FusionFactory-style routing data: each panel
+member is a (query, model, response, performance) observation. This module turns
+a :class:`~custom_routers.fusion_gate.executor.FusionResult` into two things:
+
+  * ``log_fusion`` — one append-only structured JSONL line per fusion call,
+    capturing the decision context (query, panel, judge, raw responses,
+    analysis, token, cost) for audit and offline replay.
+  * ``to_training_rows`` — per-model rows decomposed from ``responses[]``, shaped
+    to be consumed by ``llmrouter/data/api_calling_evaluation.py`` (which keys on
+    ``query`` / ``model_name`` / ``response`` / ``performance``).
+
+Secrets hygiene: this sink NEVER serializes the untouched provider payload
+(``FusionResult.raw``) and NEVER writes API keys, auth headers, or PII. Only the
+explicitly enumerated fields below are emitted; everything else is dropped.
+
+Default sink path mirrors the OpenClaw memory bank:
+``~/.llmrouter/openclaw_memory.jsonl`` (override via ``sink_path``).
+
+See: fusion-gate-router-prd-v0.2.0.md, openclaw_router/memory.py,
+llmrouter/data/api_calling_evaluation.py.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from .executor import FusionResult
+
+# Default JSONL sink, shared with the OpenClaw memory bank layout.
+DEFAULT_SINK_PATH = str(Path.home() / ".llmrouter" / "openclaw_memory.jsonl")
+
+# Exact (case-insensitive) key names that mark a mapping entry as
+# credential-bearing. Any matching key is dropped before serialization, at any
+# nesting depth.
+#
+# Exact-match (not substring) is deliberate: substring matching on "token" /
+# "auth" / "session" silently drops legitimate fields like ``prompt_tokens``,
+# ``completion_tokens``, ``author``, ``authentication_method``, and
+# ``session_id`` that may appear in usage/tracing metadata or multi-turn
+# response structures. Actual inline credentials in free text are caught by
+# ``_INLINE_SECRET_RE`` instead, which is the right tool for that job.
+_SECRET_KEYS = frozenset(
+    {
+        "api_key",
+        "apikey",
+        "authorization",
+        "bearer",
+        "secret",
+        "password",
+        "passwd",
+        "credential",
+        "cookie",
+    }
+)
+
+# Inline credential shapes to scrub from free text (e.g. a key leaked in a model
+# response), matched case-insensitively. Conservative: obvious key formats only.
+_INLINE_SECRET_RE = re.compile(
+    r"\b(sk-[A-Za-z0-9_\-]{12,}|Bearer\s+[A-Za-z0-9._\-]{12,})",
+    re.IGNORECASE,
+)
+
+_REDACTED = "[REDACTED]"
+
+
+def _is_secret_key(key: str) -> bool:
+    """Return True when lowercased ``key`` exactly matches a ``_SECRET_KEYS`` name."""
+    return key.lower() in _SECRET_KEYS
+
+
+def _scrub(value: Any) -> Any:
+    """Return ``value`` with credentials removed, walking containers recursively.
+
+    Dicts drop secret-named keys (the rest are stringified, values scrubbed);
+    lists/tuples become scrubbed lists; strings get inline key/bearer shapes
+    replaced with ``[REDACTED]``; anything else is returned unchanged.
+    """
+    if isinstance(value, str):
+        return _INLINE_SECRET_RE.sub(_REDACTED, value)
+    if isinstance(value, (list, tuple)):
+        return [_scrub(item) for item in value]
+    if not isinstance(value, dict):
+        return value
+    cleaned = {}
+    for key, item in value.items():
+        if not _is_secret_key(str(key)):
+            cleaned[str(key)] = _scrub(item)
+    return cleaned
+
+
+def _scrub_response(resp: dict[str, Any]) -> dict[str, Any]:
+    """Reduce one panel response to scrubbed ``model`` / ``content`` fields.
+
+    The response is scrubbed first, then only those two keys are kept, so any
+    other field a provider payload carried is dropped. A non-dict ``resp``
+    yields ``{"model": None, "content": None}``.
+    """
+    if not isinstance(resp, dict):
+        return {"model": None, "content": None}
+    cleaned = _scrub(resp)
+    return {field: cleaned.get(field) for field in ("model", "content")}
+
+
+def _utc_now_iso() -> str:
+    return datetime.now(tz=timezone.utc).isoformat()
+
+
+def _resolve_path(sink_path: str | None) -> Path:
+    """Pick the sink file (blank/None -> default) and expand ``~`` / env vars."""
+    target = (sink_path or "").strip() or DEFAULT_SINK_PATH
+    return Path(os.path.expanduser(os.path.expandvars(target)))
+
+
+def log_fusion(
+    result: FusionResult,
+    query: str,
+    sink_path: str | None = None,
+    token: int | None = None,
+    cost: float | None = None,
+) -> Path:
+    """Append one structured JSONL entry describing a fusion call.
+
+    Args:
+        result: Parsed fusion output (panel responses, analysis, judge, cost).
+        query: The user query for the call; inline secrets in it are redacted.
+        sink_path: Target JSONL file. Defaults to
+            ``~/.llmrouter/openclaw_memory.jsonl``. ``~`` / env vars are expanded.
+        token: Total token count for the call, when known. Falls back to None.
+        cost: Total cost for the call. Falls back to ``result.cost`` when None.
+
+    Returns:
+        The resolved :class:`~pathlib.Path` the entry was appended to.
+
+    Notes:
+        The provider's raw payload (``result.raw``) is intentionally NOT written.
+        The free-text fields (query, responses, analysis) pass through ``_scrub``
+        so secret-named keys and inline key/bearer shapes are not persisted.
+    """
+    path = _resolve_path(sink_path)
+
+    record = {
+        "ts": _utc_now_iso(),
+        "strategy": "fusion",
+        "query": _scrub(query),
+        "panel": list(result.panel),
+        "judge": result.judge,
+        "responses": [_scrub_response(r) for r in result.responses],
+        "analysis": _scrub(result.analysis) if result.analysis is not None else None,
+        "token": token,
+        "cost": cost if cost is not None else result.cost,
+    }
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("a", encoding="utf-8") as f:
+        f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    return path
+
+
+def to_training_rows(result: FusionResult, query: str) -> list[dict[str, Any]]:
+    """Decompose ``responses[]`` into per-model FusionFactory training rows.
+
+    Each panel response becomes one row keyed to match the schema produced by
+    ``llmrouter/data/api_calling_evaluation.py``: ``query`` / ``model_name`` /
+    ``response`` / ``performance``. The ``model`` alias is included alongside
+    ``model_name`` so the rows also satisfy the OpenClaw memory layout, which
+    keys on ``query`` / ``model``.
+
+    Args:
+        result: Parsed fusion output containing the panel ``responses[]``.
+        query: The user query that produced the responses.
+
+    Returns:
+        One dict per panel response. ``performance`` defaults to ``None`` because
+        fusion responses are not graded at log time; an offline evaluator fills
+        it in. Query and content are both scrubbed of inline secrets.
+
+    Notes:
+        Inline key/bearer shapes are redacted in every free-text field emitted.
+    """
+    safe_query = _scrub(query)  # same redaction the per-response content gets
+    rows: list[dict[str, Any]] = []
+    for safe in map(_scrub_response, result.responses):
+        model = safe.get("model")
+        rows.append(
+            {
+                "query": safe_query,
+                "model_name": model,
+                "model": model,
+                "response": safe.get("content"),
+                "performance": None,
+                "strategy": "fusion",
+                "judge": result.judge,
+            }
+        )
+    return rows
diff --git a/custom_routers/fusion_gate/gate.py b/custom_routers/fusion_gate/gate.py
new file mode 100644
index 0000000..c019dbb
--- /dev/null
+++ b/custom_routers/fusion_gate/gate.py
@@ -0,0 +1,328 @@
+"""RouteGate — the single-vs-fusion decision (UMB-119).
+
+The gate decides, per query, whether to take the cheap SINGLE-model path or
+escalate to the FUSION path. The decision is driven by two scalars in [0, 1]:
+
+  difficulty  — how hard the query is (higher => more likely to fuse)
+  confidence  — how sure the gate is in its single-vs-fusion call
+
+Difficulty estimation follows LLMRouter's ``ThresholdRouter``
+(``custom_routers/thresholdrouter/router.py``) two-mode design:
+
+  1. Injected estimator (preferred). When the caller supplies a query embedding
+     via ``query_input['embedding']`` AND an estimator is wired in, the gate
+     defers to the learned ``DifficultyEstimator``. The estimator is duck-typed
+     (any callable ``embedding -> score``) so this module needs no torch import
+     and stays unit-testable with no trained model present.
+
+  2. Lexical fallback (always available). A deterministic, documented heuristic
+     over the raw query text — length, code/math markers, multi-part questions.
+     This guarantees the gate runs end-to-end with no embedding and no model.
+
+The estimator and the lexical heuristic are kept as separate methods so each is
+independently unit-testable. ``GateDecision`` is extended additively only.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Any, Callable, Literal
+
+# Three-tier dial (UMB-124). The middle tier escalates a mid-difficulty query to
+# a *cheap* Budget fusion panel; the top tier escalates the hardest queries to
+# the full Quality fusion panel. ``"fusion"`` is kept unchanged as the explicit
+# Quality tier, so existing callers/tests that branch on ``tier == "fusion"``
+# keep working. The extension is additive: ``"budget_fusion"`` is a new value
+# inserted between the existing ones, not a rename.
+Tier = Literal["single", "budget_fusion", "fusion"]
+
+# Tiers that take the FUSION path (panel + judge) rather than single routing.
+FUSION_TIERS: frozenset[str] = frozenset({"budget_fusion", "fusion"})
+
+# Maps a fusion tier to the panel preset used when capability data is missing.
+# Only the mid tier has a fixed preset here: ``budget_fusion`` always falls back
+# to the cheap Budget panel. The top ``fusion`` tier is intentionally absent so
+# ``TIER_TO_PRESET.get(tier, self.panel_preset)`` resolves it to the router's
+# configured ``panel_preset`` (Quality by default) — a hardcoded "fusion":
+# "Quality" entry here would be dead data, always overridden by panel_preset.
+TIER_TO_PRESET: dict[str, str] = {
+    "budget_fusion": "Budget",
+}
+
+
+def resolve_preset(tier: str, default_preset: str) -> str:
+    """Map a gate tier to the panel preset it should fall back on.
+
+    One tier->preset lookup shared by ``FusionGateRouter._select_panel`` and the
+    eval harness so the two cannot drift apart. A tier in ``TIER_TO_PRESET``
+    (``budget_fusion`` -> Budget) gets its fixed preset; every other tier,
+    including the top ``fusion`` tier, gets ``default_preset``.
+
+    Args:
+        tier: The gate-decided tier (e.g. ``"budget_fusion"`` / ``"fusion"``).
+        default_preset: Preset returned when ``tier`` has no fixed mapping.
+
+    Returns:
+        The preset name (e.g. ``"Budget"`` / ``"Quality"``).
+    """
+    try:
+        return TIER_TO_PRESET[tier]
+    except KeyError:
+        return default_preset
+
+# --- lexical heuristic tuning constants (documented, deterministic) ----------
+# Difficulty is a weighted blend of independent signals, each normalized to
+# [0, 1], then clamped. Weights sum to 1.0 so difficulty stays in [0, 1].
+_LENGTH_SATURATION_CHARS = 400.0  # query length at which the length signal hits 1.0
+_LENGTH_WEIGHT = 0.40
+_CODE_MATH_WEIGHT = 0.35
+_MULTIPART_WEIGHT = 0.25
+_MULTIPART_SATURATION = 3.0  # number of sub-questions at which the signal hits 1.0
+
+# Code/math markers, deliberately a short list. Keywords match as case-insensitive
+# substrings (so "decode" also hits "code"); symbol matches are literal.
+_CODE_MATH_KEYWORDS = (
+    "code",
+    "function",
+    "algorithm",
+    "compile",
+    "debug",
+    "regex",
+    "integral",
+    "derivative",
+    "theorem",
+    "proof",
+    "equation",
+    "matrix",
+    "complexity",
+)
+_CODE_MATH_SYMBOLS = ("```", "def ", "class ", "{", "}", ";", "=>", "==", "->", "\\", "^", "∫", "∑", "√")
+
+
+@dataclass
+class GateDecision:
+    """Result of the gate.
+
+    tier        : "single", "budget_fusion", or "fusion" (see ``Tier``)
+    model_name  : chosen model when tier == "single" (None for fusion tiers)
+    panel       : optional pre-selected panel for fusion (UMB-123 may fill this);
+                  when empty, the router/executor fall back to the preset
+    difficulty  : estimated difficulty in [0, 1] (for logging / threshold tuning)
+    confidence  : router confidence in [0, 1]
+    """
+
+    tier: Tier
+    model_name: str | None = None
+    panel: list[str] = field(default_factory=list)
+    difficulty: float = 0.0
+    confidence: float = 1.0
+
+
+class RouteGate:
+    """Route-vs-fuse gate.
+
+    Args:
+        llm_data: name -> candidate-metadata mapping (from default_llm.json).
+        threshold: difficulty cutoff for the full ``fusion`` tier; injected from
+            the router YAML by ``FusionGateRouter`` (0.5 is a standalone default).
+        estimator: optional learned difficulty estimator: any callable mapping
+            a query embedding to a difficulty score. No torch dependency is
+            needed; ``.item()`` is used when the return value exposes it.
+        budget_threshold: optional lower cutoff for ``budget_fusion``; None or a
+            value >= ``threshold`` disables the middle tier.
+    """
+
+    def __init__(
+        self,
+        llm_data: dict[str, Any],
+        threshold: float = 0.5,
+        estimator: Callable[[Any], Any] | None = None,
+        budget_threshold: float | None = None,
+    ) -> None:
+        self.llm_data = llm_data
+        self.llm_names = list(llm_data.keys())
+        self.threshold = threshold
+        self.estimator = estimator
+        # Three-tier dial (UMB-124). ``budget_threshold`` is the LOWER boundary:
+        #   difficulty <  budget_threshold              -> single
+        #   budget_threshold <= difficulty < threshold  -> budget_fusion (cheap)
+        #   difficulty >= threshold                      -> fusion (Quality)
+        # When ``budget_threshold`` is None (or >= threshold) the middle tier is
+        # disabled and the gate degrades to the original two-tier single/fusion
+        # behavior, so configs that only set ``threshold`` are unaffected.
+        if budget_threshold is None or budget_threshold >= threshold:
+            self.budget_threshold = threshold
+        else:
+            self.budget_threshold = budget_threshold
+
+    def decide(self, query_input: dict) -> GateDecision:
+        """Decide single-vs-fusion for one query.
+
+        Rules (three-tier dial, UMB-124):
+          - ``high_stakes`` forces the full Quality ``fusion`` tier regardless of
+            difficulty (max confidence in the fusion call, since the caller has
+            overridden the gate).
+          - otherwise estimate difficulty (injected estimator if an embedding is
+            present, else the lexical fallback) and place it against the two
+            thresholds:
+              * difficulty >= ``threshold``            => ``fusion`` (Quality panel)
+              * ``budget_threshold`` <= difficulty     => ``budget_fusion`` (cheap)
+              * difficulty < ``budget_threshold``      => ``single`` (cheapest model)
+          - confidence is derived from the margin to ``threshold`` (see
+            ``_confidence``). When the middle tier is disabled
+            (``budget_threshold == threshold``) this collapses to single/fusion.
+        """
+        if query_input.get("high_stakes"):
+            # Caller override: fuse, and report difficulty if we can still compute
+            # it for logging, but the decision itself is forced with full confidence.
+            difficulty = self._difficulty(query_input)
+            return GateDecision(
+                tier="fusion",
+                difficulty=difficulty,
+                confidence=1.0,
+            )
+
+        difficulty = self._difficulty(query_input)
+        confidence = self._confidence(difficulty)
+
+        if difficulty >= self.threshold:
+            return GateDecision(
+                tier="fusion",
+                difficulty=difficulty,
+                confidence=confidence,
+            )
+
+        if difficulty >= self.budget_threshold:
+            return GateDecision(
+                tier="budget_fusion",
+                difficulty=difficulty,
+                confidence=confidence,
+            )
+
+        return GateDecision(
+            tier="single",
+            model_name=self._cheapest_model(),
+            difficulty=difficulty,
+            confidence=confidence,
+        )
+
+    # ----------------------------------------------------------- difficulty
+
+    def _difficulty(self, query_input: dict) -> float:
+        """Estimate difficulty in [0, 1] for a query.
+
+        Prefers the injected estimator when both an embedding and an estimator
+        are available; otherwise falls back to the deterministic lexical
+        heuristic over the raw query text.
+        """
+        embedding = query_input.get("embedding")
+        if embedding is not None and self.estimator is not None:
+            return self._estimate_with_model(embedding)
+        return self._lexical_difficulty(query_input.get("query", ""))
+
+    def _estimate_with_model(self, embedding: Any) -> float:
+        """Run the injected estimator and coerce its output to a clamped float.
+
+        The estimator is duck-typed: any callable returning either a Python float
+        or an object exposing ``.item()`` (e.g. a 0-d / 1-element tensor). This
+        mirrors ``ThresholdRouter._estimate_difficulty`` without importing torch.
+        """
+        score = self.estimator(embedding)
+        if hasattr(score, "item"):
+            score = score.item()
+        return self._clamp(float(score))
+
+    def _lexical_difficulty(self, query: str) -> float:
+        """Deterministic lexical difficulty heuristic (no model required).
+
+        Blends three normalized signals:
+          - length      : ``len(query) / 400`` clamped to 1.0.
+          - code/math    : 1.0 if any code/math keyword or symbol is present,
+                           else 0.0.
+          - multi-part   : count of sub-questions (``?`` plus enumerated/"and"-joined
+                           clauses) normalized by ``_MULTIPART_SATURATION``.
+
+        The blend is a fixed-weight convex combination, so the result is always
+        in [0, 1] and fully reproducible. Kept pure (text in, float out) for
+        unit testing.
+        """
+        if not query:
+            return 0.0
+
+        length_signal = self._clamp(len(query) / _LENGTH_SATURATION_CHARS)
+        code_math_signal = 1.0 if self._has_code_or_math(query) else 0.0
+        multipart_signal = self._clamp(self._count_subquestions(query) / _MULTIPART_SATURATION)
+
+        difficulty = (
+            _LENGTH_WEIGHT * length_signal
+            + _CODE_MATH_WEIGHT * code_math_signal
+            + _MULTIPART_WEIGHT * multipart_signal
+        )
+        return self._clamp(difficulty)
+
+    @staticmethod
+    def _has_code_or_math(query: str) -> bool:
+        """True if the query contains a code/math keyword (as a substring) or symbol."""
+        lowered = query.lower()
+        if any(keyword in lowered for keyword in _CODE_MATH_KEYWORDS):
+            return True
+        return any(symbol in query for symbol in _CODE_MATH_SYMBOLS)
+
+    @staticmethod
+    def _count_subquestions(query: str) -> int:
+        """Count distinct sub-questions / parts in a query.
+
+        Heuristic: the larger of (a) the number of '?' characters and
+        (b) 1 + the number of enumerated parts or coordinating " and "/" ; "
+        separators. A single simple question therefore counts as 1.
+        """
+        question_marks = query.count("?")
+        # Enumerated markers ("1. ", "2) ") plus " ; " and " and " separators.
+        enumerations = len(re.findall(r"(?:\b\d+[.)]\s)|(?:\s;\s)|(?:\sand\s)", query))
+        parts = max(question_marks, 1 + enumerations)
+        return parts
+
+    # ----------------------------------------------------------- confidence
+
+    def _confidence(self, difficulty: float) -> float:
+        """Derive confidence in [0, 1] from the margin to ``threshold``.
+
+        Intuition: the gate is most confident when difficulty sits far from the
+        threshold (clearly easy or clearly hard) and least confident right at the
+        boundary. The absolute margin to ``threshold`` is divided by the larger
+        side of the threshold split, so only that larger side reaches 1.0 at its
+        extreme (both do when ``threshold`` is 0.5); ``budget_threshold`` is unused.
+        """
+        margin = abs(difficulty - self.threshold)
+        span = max(self.threshold, 1.0 - self.threshold)
+        if span <= 0.0:
+            return 1.0
+        return self._clamp(margin / span)
+
+    # ----------------------------------------------------------- selection
+
+    def cheapest_model(self) -> str:
+        """Pick the lowest-cost candidate as the single-path default (public API).
+
+        Public entry point for callers outside this class (the router's downgrade
+        guard, the eval harness). Delegates to the private implementation.
+        """
+        return self._cheapest_model()
+
+    def _cheapest_model(self) -> str:
+        """Pick the lowest-cost candidate as the single-path default."""
+
+        def cost(name: str) -> float:
+            info = self.llm_data.get(name, {})
+            # default_llm.json uses input_price / output_price; fall back gracefully
+            return float(info.get("input_price", 0.0)) + float(info.get("output_price", 0.0))
+
+        return min(self.llm_names, key=cost) if self.llm_names else ""
+
+    # ----------------------------------------------------------- utilities
+
+    @staticmethod
+    def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
+        """Clamp ``value`` into [low, high]; note a NaN ``value`` comes out as ``high``."""
+        return max(low, min(high, value))
diff --git a/custom_routers/fusion_gate/router.py b/custom_routers/fusion_gate/router.py
new file mode 100644
index 0000000..75f5fe6
--- /dev/null
+++ b/custom_routers/fusion_gate/router.py
@@ -0,0 +1,270 @@
+"""FusionGateRouter — route-vs-fuse meta-router for LLMRouter.
+
+Integration entry point (UMB-118/121/123/124). Implements the MetaRouter
+contract (``route_single`` / ``route_batch``) and owns:
+
+  - UMB-121: reading and respecting all six config keys (``threshold``, ``k``,
+    ``judge``, ``provider``/``base_url``, ``panel_preset``, ``cost_ceiling``),
+    plus a spend-free ``--route-only`` path and the cost guard.
+  - UMB-123: capability-scored panel selection via ``CapabilityScorer``, with a
+    preset (Quality / Budget) fallback when capability data is unavailable.
+  - UMB-124: a three-tier dial — ``single`` -> ``budget_fusion`` (cheap panel)
+    -> ``fusion`` (full Quality panel) — threaded from the gate through to panel
+    selection.
+
+The router's only job is to DECIDE, per query, between:
+  - the cheap SINGLE-model path (classic LLMRouter routing),
+  - a cheap BUDGET-fusion panel, or
+  - the full QUALITY-fusion panel (OpenRouter ``openrouter:fusion`` server tool).
+
+Routing never spends: ``route_single`` returns a decision dict (no API call).
+Spend happens only in ``fuse()``, which is invoked separately and logs every
+call via ``fusion_log.log_fusion``.
+
+See: fusion-gate-router-prd-v0.2.0.md
+"""
+
+from __future__ import annotations
+
+import sys
+from typing import Any
+
+# torch is a GENUINE transitive requirement of this module: MetaRouter subclasses
+# torch.nn.Module, so importing FusionGateRouter requires torch even though the
+# gate itself runs no inference. This import is intentionally eager (not lazy) so
+# the dependency fails loudly at import time rather than mid-route. Torch-free
+# unit tests therefore load gate.py / executor.py / fusion_log.py by file path,
+# never through this module — see custom_routers/fusion_gate/tests/.
+import torch.nn as nn
+
+from llmrouter.models.meta_router import MetaRouter
+
+from .capability import CapabilityScorer
+from .executor import CostCeilingExceeded, FusionExecutor, FusionResult
+from .fusion_log import log_fusion
+from .gate import FUSION_TIERS, GateDecision, RouteGate, resolve_preset
+
+
+class FusionGateRouter(MetaRouter):
+    """Meta-router that gates each query between single routing and fusion tiers.
+
+    Decision contract returned by ``route_single``:
+
+      single path → {"query", "strategy": "single", "tier": "single",
+                     "model_name", "predicted_llm", "difficulty", "confidence"}
+      fusion path → {"query", "strategy": "fusion", "tier": "budget_fusion"|"fusion",
+                     "panel": [...], "judge": ..., "model_name", "predicted_llm",
+                     "difficulty", "confidence", "projected_cost"}
+
+    Both shapes carry ``strategy``, ``tier`` and ``model_name`` (the CLI's
+    ``route_query`` keys on ``model_name`` / ``predicted_llm``). A cost-ceiling
+    downgrade uses the single shape plus ``downgraded_from``, ``projected_cost``
+    and ``cost_ceiling``. The fusion path's ``model_name`` is the judge (or panel
+    head) purely as a label — no API call is made during routing.
+    """
+
+    def __init__(self, yaml_path: str) -> None:
+        # MetaRouter manages config/LLM-candidate loading. A simple gate needs no
+        # trainable model, so Identity stands in until a learned gate lands.
+        model = nn.Identity()
+        super().__init__(model=model, yaml_path=yaml_path)
+
+        # Available candidate LLMs (name -> metadata dict from default_llm.json).
+        self.llm_names: list[str] = list(self.llm_data.keys())
+
+        # ------------------------------------------------------ config (UMB-121)
+        # All six config keys are read here and respected downstream. `.get`
+        # keeps construction robust if a key is omitted.
+        hparam: dict[str, Any] = self.cfg.get("hparam", {}) or {}
+        self.threshold: float = float(hparam.get("threshold", 0.5))
+        self.k: int = int(hparam.get("k", 3))
+        self.judge: str | None = hparam.get("judge")
+        self.panel_preset: str = hparam.get("panel_preset", "Quality")
+        cost_ceiling = hparam.get("cost_ceiling")
+        self.cost_ceiling: float | None = (
+            float(cost_ceiling) if cost_ceiling is not None else None
+        )
+        # Per-completion output-token estimate feeding the dollar cost projection.
+        self.est_completion_tokens: int = int(hparam.get("est_completion_tokens", 512))
+        # Three-tier dial (UMB-124): optional lower boundary for the middle tier.
+        budget_threshold = hparam.get("budget_threshold")
+        self.budget_threshold: float | None = (
+            float(budget_threshold) if budget_threshold is not None else None
+        )
+
+        # Provider / base_url (UMB-121). ``base_url`` is the endpoint handed to
+        # the executor as ``api_endpoint``; ``provider`` is only stored on the
+        # router here (NOTE(review): it is not passed to FusionExecutor below).
+        # Lookup order: hparam, then top-level ``base_url``, then ``api_endpoint``.
+        self.provider: str | None = hparam.get("provider") or self.cfg.get("provider")
+        self.base_url: str | None = (
+            hparam.get("base_url")
+            or self.cfg.get("base_url")
+            or self.cfg.get("api_endpoint")
+        )
+
+        # Optional JSONL sink for fusion logging (defaults inside fusion_log).
+        self.log_sink_path: str | None = hparam.get("log_sink_path")
+
+        # ----------------------------------------------------------- seams
+        self.gate = RouteGate(
+            llm_data=self.llm_data,
+            threshold=self.threshold,
+            budget_threshold=self.budget_threshold,
+        )
+        # Capability scorer (UMB-123) sources per-model performance from the
+        # routing data the DataLoader attached (DataFrame or None). Prefer the
+        # train split (richer); fall back to test split.
+        routing_data = getattr(self, "routing_data_train", None)
+        if routing_data is None:
+            routing_data = getattr(self, "routing_data_test", None)
+        self.capability = CapabilityScorer(
+            llm_data=self.llm_data,
+            routing_data=routing_data,
+        )
+        self.executor = FusionExecutor(
+            llm_data=self.llm_data,
+            judge=self.judge,
+            panel_preset=self.panel_preset,
+            cost_ceiling=self.cost_ceiling,
+            api_endpoint=self.base_url,
+            est_completion_tokens=self.est_completion_tokens,
+        )
+
+    # ------------------------------------------------------------------ routing
+
+    def route_single(self, query_input: dict) -> dict:
+        """Decide the tier for one query and, when fusing, plan the panel.
+
+        SPEND-FREE: this method only builds a decision dict; execution is left
+        to ``fuse()``, so ``--route-only`` (and the normal CLI route step) never
+        spend. Fusion decisions carry the intended panel/judge and a projected
+        cost so callers can audit the plan before invoking ``fuse()``.
+        """
+        query = query_input["query"]
+        decision: GateDecision = self.gate.decide(query_input)
+        # Gate scores are echoed unchanged in every decision shape below.
+        scores = {"difficulty": decision.difficulty, "confidence": decision.confidence}
+
+        if decision.tier not in FUSION_TIERS:
+            return {
+                "query": query,
+                "strategy": "single",
+                "tier": decision.tier,
+                "model_name": decision.model_name,
+                "predicted_llm": decision.model_name,
+                **scores,
+            }
+
+        # Fusion tier: select the panel (UMB-123/124). The judge is the
+        # config-driven slug (None => the executor uses the outer model).
+        judge = self.judge
+        panel = self._select_panel(query_input, decision)
+        projected = self.executor.project_cost(panel, judge, query=query)
+
+        # Cost guard (UMB-121): a projected cost above the ceiling aborts fusion
+        # by DOWNGRADING to the cheapest single model instead of spending. The
+        # downgrade is reported via ``tier`` and ``downgraded_from``.
+        over_budget = self.cost_ceiling is not None and projected > self.cost_ceiling
+        if over_budget:
+            fallback_model = self.gate.cheapest_model()
+            return {
+                "query": query,
+                "strategy": "single",
+                "tier": "single",
+                "downgraded_from": decision.tier,
+                "model_name": fallback_model,
+                "predicted_llm": fallback_model,
+                **scores,
+                "projected_cost": projected,
+                "cost_ceiling": self.cost_ceiling,
+            }
+
+        # Label only (the judge, else the panel head); routing makes no API call.
+        label = judge or (panel[0] if panel else None)
+        return {
+            "query": query,
+            "strategy": "fusion",
+            "tier": decision.tier,
+            "panel": panel,
+            "judge": judge,
+            "model_name": label,
+            "predicted_llm": label,
+            **scores,
+            "projected_cost": projected,
+        }
+
+    def route_batch(self, batch: list) -> list:
+        """Route each query in ``batch`` via ``route_single``, preserving order."""
+        return list(map(self.route_single, batch))
+
+    # ----------------------------------------------------------------- internals
+
+    def _select_panel(self, query_input: dict, decision: GateDecision) -> list[str]:
+        """Choose the fusion panel (maps to the tool's ``analysis_models``).
+
+        Resolution order (UMB-123/124), first non-empty result wins:
+          1. A panel pre-selected on the gate decision.
+          2. The capability scorer's top-``k`` for this query (UMB-123), so
+             panel membership can vary by query type.
+          3. A preset panel chosen by tier (UMB-124) via ``resolve_preset``,
+             which receives the tier and the configured ``panel_preset``.
+        """
+        preselected = decision.panel
+        if preselected:
+            return preselected
+
+        scored = self.capability.select_panel(query_input.get("query", ""), self.k)
+        if scored:
+            return scored
+
+        # Fallback: the scorer produced no panel for this query -> preset panel.
+        # The preset is resolved from the decision tier and the router's
+        # configured ``panel_preset`` by ``gate.resolve_preset``, so the
+        # tier -> preset mapping lives in one place. Per the author's notes,
+        # ``budget_fusion`` maps to the cheap Budget panel and the top
+        # ``fusion`` tier to the configured preset.
+        return self.capability.preset_panel(
+            resolve_preset(decision.tier, self.panel_preset), self.k
+        )
+
+    # --------------------------------------------------------------- execution
+
+    def fuse(self, route_result: dict, **gen_kwargs: Any) -> FusionResult:
+        """Run a fusion decision through the FusionExecutor (UMB-120).
+
+        Deliberately separate from ``route_single`` so ``--route-only``
+        (UMB-121) can return a decision without ever reaching this method.
+        After the executor returns, the call is handed to
+        ``fusion_log.log_fusion`` (UMB-125) on a best-effort basis.
+
+        ``route_result`` must carry ``query`` and ``panel`` (``judge`` is
+        optional); ``gen_kwargs`` are forwarded unchanged to the executor.
+
+        Raises:
+            ValueError: if ``route_result["strategy"]`` is not ``"fusion"``.
+            CostCeilingExceeded: propagated unchanged if the executor raises it.
+        """
+        if route_result.get("strategy") != "fusion":
+            raise ValueError("fuse() called on a non-fusion route result")
+
+        query = route_result["query"]
+        panel = route_result["panel"]
+        judge = route_result.get("judge")
+        result = self.executor.run(query=query, panel=panel, judge=judge, **gen_kwargs)
+
+        # Log the fusion call (audit + training signal). A logging failure
+        # (disk-full, permission-denied, NFS timeout) must NEVER destroy the
+        # already-computed result, so the append is best-effort: any Exception
+        # is reduced to a one-line stderr notice and the result is returned
+        # regardless.
+        try:
+            log_fusion(result, query=query, sink_path=self.log_sink_path, cost=result.cost)
+        except Exception as exc:  # noqa: BLE001 - logging must not lose the result
+            # Report the failure type only; never echo the path/secret-bearing detail.
+            notice = (
+                f"fusion_log: failed to persist fusion call ({type(exc).__name__}); "
+                "continuing without losing the result."
+            )
+            print(notice, file=sys.stderr)
+        return result
diff --git a/custom_routers/fusion_gate/tests/__init__.py b/custom_routers/fusion_gate/tests/__init__.py
new file mode 100644
index 0000000..078ea56
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for the fusion_gate plugin."""
diff --git a/custom_routers/fusion_gate/tests/conftest.py b/custom_routers/fusion_gate/tests/conftest.py
new file mode 100644
index 0000000..1d90e5d
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/conftest.py
@@ -0,0 +1,37 @@
+"""pytest bootstrap for the fusion_gate test suite.
+
+Makes ``pytest custom_routers/fusion_gate/tests/`` work out of the box, without
+relying on the standalone ``python test_gate.py`` runner as a workaround.
+
+Two things are guaranteed here:
+
+  1. The repo root is on ``sys.path`` so ``custom_routers`` resolves as a package
+     (the torch-dependent ``test_router`` imports ``custom_routers.fusion_gate.router``
+     directly).
+  2. ``--import-mode=importlib`` is enabled so pytest does not rewrite ``sys.path``
+     in ways that re-trigger package ``__init__`` collection. Combined with the
+     lazy ``__getattr__`` in ``fusion_gate/__init__.py``, the five torch-free test
+     modules (gate / executor / capability / fusion_log / eval_harness) collect and
+     run with no torch installed.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+# Repo root = three levels up from this file (tests/ -> fusion_gate/ ->
+# custom_routers/ -> repo root).
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+
+def pytest_configure(config) -> None:
+    """Switch this pytest run to the ``importlib`` import mode.
+
+    Set from this conftest rather than a repo-level pytest.ini so that nothing
+    outside custom_routers/fusion_gate/ has to be edited.
+    """
+    options = config.option
+    options.importmode = "importlib"
diff --git a/custom_routers/fusion_gate/tests/test_capability.py b/custom_routers/fusion_gate/tests/test_capability.py
new file mode 100644
index 0000000..fd1ec56
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/test_capability.py
@@ -0,0 +1,197 @@
+"""Offline unit tests for ``CapabilityScorer`` (UMB-123).
+
+Fully offline: no network, no torch, no trained model, and no large data files.
+``capability.py`` is loaded directly by file path (like ``test_gate.py``) so the
+package ``__init__`` — which pulls in ``router.py``/torch — is never imported.
+
+Coverage:
+  - panel membership VARIES by query type (code/math/reasoning vs general) when
+    backed by per-category routing performance
+  - top-k respected; k clamped against the candidate set
+  - preset fallback (Quality vs Budget) resolves by price
+  - ``select_panel`` returns None (-> preset fallback) when no capability data
+    and llm_data carries no usable prior
+  - task_name -> category bucketing
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import os
+import sys
+from typing import Any
+
+_CAP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "capability.py"))
+_spec = importlib.util.spec_from_file_location("fusion_gate_capability", _CAP_PATH)
+assert _spec is not None and _spec.loader is not None
+_cap_mod = importlib.util.module_from_spec(_spec)
+sys.modules[_spec.name] = _cap_mod
+_spec.loader.exec_module(_cap_mod)
+
+CapabilityScorer = _cap_mod.CapabilityScorer
+
+# Candidate set mirroring default_llm.json shape (size / feature / prices).
+LLM_DATA: dict[str, dict[str, Any]] = {
+    "cheap-7b": {
+        "size": "7B",
+        "feature": "fast and efficient small model",
+        "input_price": 0.20,
+        "output_price": 0.20,
+    },
+    "mid-49b": {
+        "size": "49B",
+        "feature": "powerful high-accuracy model for complex tasks",
+        "input_price": 0.90,
+        "output_price": 0.90,
+    },
+    "big-141b": {
+        "size": "141B",
+        "feature": "advanced large-scale model with exceptional performance",
+        "input_price": 1.20,
+        "output_price": 1.20,
+    },
+    "moe-45b": {
+        "size": "45B",
+        "feature": "mixture of experts optimized for creative generation",
+        "input_price": 0.60,
+        "output_price": 0.60,
+    },
+}
+
+# Routing rows that make different models best at different categories so the
+# panel is forced to vary by query type. cheap-7b dominates "code", big-141b
+# dominates "reasoning"/"math".
+ROUTING_ROWS = [
+    {"task_name": "humaneval-code", "model_name": "cheap-7b", "performance": 0.95},
+    {"task_name": "humaneval-code", "model_name": "mid-49b", "performance": 0.30},
+    {"task_name": "humaneval-code", "model_name": "big-141b", "performance": 0.20},
+    {"task_name": "humaneval-code", "model_name": "moe-45b", "performance": 0.40},
+    {"task_name": "agentverse-logicgrid", "model_name": "cheap-7b", "performance": 0.10},
+    {"task_name": "agentverse-logicgrid", "model_name": "mid-49b", "performance": 0.50},
+    {"task_name": "agentverse-logicgrid", "model_name": "big-141b", "performance": 0.98},
+    {"task_name": "agentverse-logicgrid", "model_name": "moe-45b", "performance": 0.40},
+]
+
+
+def _scorer(routing_data: Any = None) -> CapabilityScorer:  # default: ROUTING_ROWS
+    # NOTE(review): `or` is a truthiness test, so an empty list also falls back.
+    return CapabilityScorer(llm_data=LLM_DATA, routing_data=routing_data or ROUTING_ROWS)
+
+
+# ----------------------------------------------------------- query classification
+
+
+def test_classify_query_categories():
+    classify = _scorer().classify_query  # query -> category bucket under test
+    assert classify("Write a python function to debug this code") == "code"
+    assert classify("Compute the integral and prove the theorem") == "math"
+    assert classify("Solve this logic puzzle step by step") == "reasoning"
+    assert classify("What is the capital of France?") == "general"
+    assert classify("") == "general"
+
+
+# ------------------------------------------------------------- panel variation
+
+
+def test_panel_varies_by_query_type():
+    """Code and reasoning queries must produce different top-2 panels."""
+    scorer = _scorer()
+    for_code = scorer.select_panel("Write a function to fix this bug in my code", k=2)
+    for_reasoning = scorer.select_panel("Solve this logic puzzle, reason step by step", k=2)
+
+    assert for_code is not None and for_reasoning is not None
+    # ROUTING_ROWS make cheap-7b the best code model and big-141b the best reasoner.
+    assert for_code[0] == "cheap-7b"
+    assert for_reasoning[0] == "big-141b"
+    assert for_code != for_reasoning
+
+
+def test_top_k_respected_and_clamped():
+    pick = _scorer().select_panel
+    query = "debug this code"
+    assert len(pick(query, k=2)) == 2
+    # An oversized k yields every candidate rather than raising.
+    assert len(pick(query, k=99)) == len(LLM_DATA)
+    # k=0 yields None, which is the preset-fallback trigger.
+    assert pick(query, k=0) is None
+
+
+# ------------------------------------------------------------------ fallback
+
+
+def test_select_panel_returns_none_without_any_capability_signal():
+    """With empty llm_data and no routing data, select_panel yields None."""
+    bare = CapabilityScorer(llm_data={}, routing_data=None)
+    assert bare.select_panel("anything", k=3) is None
+
+
+def test_preset_panel_quality_vs_budget_by_price():
+    """Quality and Budget presets must lead with opposite ends of the price range."""
+    scorer = _scorer()
+    panels = {name: scorer.preset_panel(name, k=2) for name in ("Quality", "Budget")}
+
+    # Quality leads with the priciest fixture model (price as capability proxy).
+    assert panels["Quality"][0] == "big-141b"
+    # Budget leads with the cheapest fixture model.
+    assert panels["Budget"][0] == "cheap-7b"
+    assert panels["Quality"] != panels["Budget"]
+
+
+def test_static_prior_used_when_routing_data_absent():
+    """With routing data absent, the llm_data prior alone must still rank models."""
+    prior_only = CapabilityScorer(llm_data=LLM_DATA, routing_data=None)
+    top_two = prior_only.select_panel("general knowledge question", k=2)
+    assert top_two is not None
+    # The largest fixture model is expected first (size/feature prior).
+    assert top_two[0] == "big-141b"
+
+
+# ---------------------------------------------------------- task bucketing
+
+
+def test_task_name_to_category_bucketing():
+    bucket = _scorer()._task_to_category  # task_name -> category bucket under test
+    assert bucket("humaneval-code") == "code"
+    assert bucket("agentverse-logicgrid") == "reasoning"
+    assert bucket("gsm8k") == "math"
+    assert bucket("trivia-qa") == "general"
+    assert bucket(None) == "general"
+
+
+def test_dataframe_like_routing_data_is_accepted():
+    """Routing data given as a pandas-like object (to_dict records) is usable."""
+
+    class _FakeDF:
+        def __init__(self, rows):
+            self._rows = rows
+
+        def to_dict(self, orient="records"):  # noqa: D401 - mirror pandas API
+            assert orient == "records"
+            return self._rows
+
+    frame = _FakeDF(ROUTING_ROWS)
+    scorer = CapabilityScorer(llm_data=LLM_DATA, routing_data=frame)
+    assert scorer.select_panel("debug this code", k=1) == ["cheap-7b"]
+
+
+# ----------------------------------------------------------------- runner
+
+
+def _run_all() -> int:  # pytest-free runner: returns 1 when any test_* fails
+    checks = [fn for name, fn in sorted(globals().items()) if name.startswith("test_") and callable(fn)]
+    failed = 0
+    for check in checks:
+        try:
+            check()
+            print(f"PASS {check.__name__}")
+        except AssertionError as exc:  # pragma: no cover - reporting path
+            failed += 1
+            print(f"FAIL {check.__name__}: {exc}")
+        except Exception as exc:  # pragma: no cover - reporting path
+            failed += 1
+            print(f"ERROR {check.__name__}: {type(exc).__name__}: {exc}")
+    print(f"\n{len(checks) - failed}/{len(checks)} passed")
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(_run_all())
diff --git a/custom_routers/fusion_gate/tests/test_eval_harness.py b/custom_routers/fusion_gate/tests/test_eval_harness.py
new file mode 100644
index 0000000..15015dd
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/test_eval_harness.py
@@ -0,0 +1,227 @@
+"""Fast offline tests for the eval + retrain harness (UMB-122/124/126).
+
+These run the MOCK harness end-to-end against the bundled fixtures — zero spend,
+no network. The harness modules are imported by file path (like test_executor.py)
+so importing them never triggers the package __init__ (which pulls in torch via
+router.py); the harness itself is torch-free.
+
+Run: ``pytest custom_routers/fusion_gate/tests/test_eval_harness.py`` or, with no
+pytest installed, ``python custom_routers/fusion_gate/tests/test_eval_harness.py``.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import os
+import sys
+import types
+from pathlib import Path
+
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+_EVAL_DIR = Path(_REPO_ROOT) / "custom_routers" / "fusion_gate" / "eval"
+_FIXTURES = _EVAL_DIR / "fixtures"
+
+
+def _ensure_namespace_packages() -> None:
+    """Install stub packages in ``sys.modules`` for the eval modules' imports.
+
+    Missing package names get an empty module whose ``__path__`` is the real
+    directory, so relative imports resolve without running any ``__init__``.
+    """
+    top = Path(_REPO_ROOT) / "custom_routers"
+    layout = {"custom_routers": top, "custom_routers.fusion_gate": _EVAL_DIR.parent}
+    layout["custom_routers.fusion_gate.eval"] = _EVAL_DIR
+    for dotted, directory in layout.items():
+        if dotted not in sys.modules:
+            stub = types.ModuleType(dotted)
+            stub.__path__ = [str(directory)]  # a __path__ marks it as a package
+            sys.modules[dotted] = stub
+
+
+def _import_eval_module(name: str, filename: str):
+    """Load ``eval/<filename>`` under its dotted alias, reusing a cached module."""
+    alias = f"custom_routers.fusion_gate.eval.{name}"
+    if alias in sys.modules:
+        return sys.modules[alias]
+    spec = importlib.util.spec_from_file_location(alias, str(_EVAL_DIR / filename))
+    assert spec is not None and spec.loader is not None
+    loaded = importlib.util.module_from_spec(spec)
+    sys.modules[alias] = loaded  # register first: visible during its own execution
+    spec.loader.exec_module(loaded)
+    return loaded
+
+
+_ensure_namespace_packages()
+eval_harness = _import_eval_module("eval_harness", "eval_harness.py")
+retrain = _import_eval_module("retrain", "retrain.py")
+
+DATASET = str(_FIXTURES / "hard_slice.jsonl")
+LLM = str(_FIXTURES / "llm_candidates.json")
+ROUTING = str(_FIXTURES / "routing_data.jsonl")
+
+
+def _build_harness(**overrides):
+    """Build the mock harness from the bundled fixtures; ``overrides`` win."""
+    defaults = {"dataset_path": DATASET, "llm_path": LLM, "routing_path": ROUTING}
+    defaults.update(threshold=0.5, budget_threshold=0.3, k=2, judge=None)
+    defaults["panel_preset"] = "Quality"
+    settings = {**defaults, **overrides}
+    return eval_harness.build_mock_harness(**settings)
+
+
+# --------------------------------------------------------- harness end-to-end
+
+
+def test_harness_runs_all_three_arms_offline():
+    """All three arms run on the fixtures and report in-range metrics."""
+    harness, dataset, _ = _build_harness()
+    arms = harness.run_all()
+    assert set(arms) == {"always_route", "always_fuse", "fusion_gate"}
+    expected_n = len(dataset)
+    for result in arms.values():
+        assert result.n == expected_n
+        assert 0.0 <= result.quality <= 1.0
+        assert result.blended_cost >= 0.0
+        assert 0.0 <= result.escalation_p <= 1.0
+    # always_route never escalates; always_fuse always escalates.
+    assert arms["always_route"].escalation_p == 0.0
+    assert arms["always_fuse"].escalation_p == 1.0
+
+
+def test_blended_cost_ordering():
+    """Single-model routing is cheapest; the gate never costs more than always-fuse."""
+    harness, _, _ = _build_harness()
+    cost = {arm: result.blended_cost for arm, result in harness.run_all().items()}
+    assert cost["always_route"] < cost["always_fuse"]
+    assert cost["fusion_gate"] <= cost["always_fuse"]
+
+
+def test_m1_m2_m3_verdicts_present_and_pass_on_fixtures():
+    """M1 and M2 must pass on the fixtures; M3 must be measured on >= 1 escalation."""
+    harness, _dataset, _dataset_path = _build_harness()
+    verdicts = eval_harness.compute_verdicts(harness.run_all())
+    m1, m2, m3 = (verdicts[key] for key in ("M1", "M2", "M3"))
+    # M1: gate quality >= 95% of always-fuse quality.
+    assert m1["pass"], m1
+    # M2: blended cost <= 1.6x always-route.
+    assert m2["pass"], m2
+    # M3: gate-precision is measured (escalated queries that beat best single).
+    assert m3["gate_precision"] is not None
+    assert m3["n_escalated"] >= 1
+
+
+def test_gate_precision_counts_only_improvements():
+    """Improved escalations number at least one and never exceed all escalations."""
+    harness, _dataset, _ = _build_harness()
+    gate_arm = harness.run_fusion_gate()
+    improved, escalated = gate_arm.n_escalated_improved, gate_arm.n_escalated
+    # The fixtures are described as containing escalated queries where fusion is
+    # right while the majority single answer is wrong, hence the lower bound.
+    assert 1 <= improved <= escalated
+
+
+def test_report_and_csv_are_written(tmp_path):
+    """``main --mock --with-retrain`` exits 0 and writes results.csv / results.md."""
+    argv = ["--mock", "--dataset", DATASET, "--llm", LLM, "--routing", ROUTING]
+    argv += ["--out", str(tmp_path), "--with-retrain"]
+    assert eval_harness.main(argv) == 0
+
+    csv_path = tmp_path / "results.csv"
+    md_path = tmp_path / "results.md"
+    assert csv_path.exists()
+    assert md_path.exists()
+
+    report = md_path.read_text(encoding="utf-8")
+    # The report must explicitly flag mock provenance and the keyed live path.
+    assert "MOCK fixtures" in report
+    assert "keyed live run" in report.lower()
+    assert all(tag in report for tag in ("M1", "M2", "M3"))
+    # --with-retrain appends the retrain delta block.
+    assert "Retrain" in report and "before vs after" in report
+
+    # CSV has a header + three arm rows.
+    csv_lines = csv_path.read_text(encoding="utf-8").strip().splitlines()
+    assert len(csv_lines) == 4
+    assert csv_lines[0].startswith("arm,")
+
+
+def test_live_mode_is_blocked():
+    """``main(["--live", ...])`` must raise SystemExit so a stray run cannot spend."""
+    import pytest as _pytest  # local import: the manual runner works without pytest
+    live_argv = ["--live", "--dataset", DATASET, "--llm", LLM]
+    with _pytest.raises(SystemExit):
+        eval_harness.main(live_argv)
+
+
+# --------------------------------------------------------- retrain loop
+
+
+def test_retrain_measures_m3_before_and_after_offline():
+    """Replay a synthesized fusion log and check the before/after M3 report."""
+    dataset = eval_harness.load_jsonl(DATASET)
+    llm_data = eval_harness.load_llm_candidates(LLM)
+    base_routing = eval_harness.load_jsonl(ROUTING)
+
+    log = retrain.synthesize_fusion_log(dataset, llm_data, base_routing, k=3, judge=None)
+    assert log, "expected a non-empty synthesized fusion log"
+    # Every log entry carries at least the ``responses`` and ``query`` keys.
+    assert all("responses" in entry and "query" in entry for entry in log)
+
+    outcome = retrain.run_retrain(dataset, llm_data, base_routing, log=log, k=3, judge=None)
+    # Routing table is strictly augmented by the replayed responses.
+    assert outcome["n_augmented_routing_rows"] > outcome["n_base_routing_rows"]
+    # Both M3 measurements are produced and the delta is reported.
+    assert outcome["m3_before"] is not None
+    assert outcome["m3_after"] is not None
+    assert outcome["m3_delta"] == outcome["m3_after"] - outcome["m3_before"]
+    # Threshold is refit within bounds.
+    assert 0.1 <= outcome["threshold_after"] <= 0.9
+
+
+def test_retrain_block_renders_with_mock_flag():
+    """The mock retrain report block names the retrain, the mock data and M3."""
+    rendered = retrain.mock_retrain_report_block(
+        dataset=eval_harness.load_jsonl(DATASET),
+        llm_path=LLM, routing_path=ROUTING, k=3, judge=None,
+    )
+    for marker in ("Retrain", "MOCK fixtures", "M3 gate-precision"):
+        assert marker in rendered
+
+
+# --------------------------------------------------------- manual runner
+
+
+def _run_all_manually() -> int:
+    """Run every test_* with no pytest installed; return the failure count."""
+    import tempfile
+    candidates = sorted(globals().items())
+    selected = [(n, f) for n, f in candidates if n.startswith("test_") and callable(f)]
+    failures = 0
+    for fn_name, fn in selected:
+        try:
+            if fn_name == "test_live_mode_is_blocked":
+                # Reproduce the SystemExit assertion without pytest.
+                try:
+                    eval_harness.main(["--live", "--dataset", DATASET, "--llm", LLM])
+                except SystemExit:
+                    pass
+                else:
+                    raise AssertionError("--live should SystemExit")
+            elif fn_name == "test_report_and_csv_are_written":
+                # Stand in for pytest's tmp_path fixture with a throwaway directory.
+                with tempfile.TemporaryDirectory() as scratch:
+                    fn(Path(scratch))
+            else:
+                fn()
+            print(f"PASS {fn_name}")
+        except Exception as exc:  # noqa: BLE001
+            failures += 1
+            print(f"FAIL {fn_name}: {type(exc).__name__}: {exc}")
+    return failures
+
+
+if __name__ == "__main__":
+    raise SystemExit(1 if _run_all_manually() else 0)
diff --git a/custom_routers/fusion_gate/tests/test_executor.py b/custom_routers/fusion_gate/tests/test_executor.py
new file mode 100644
index 0000000..f74d17b
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/test_executor.py
@@ -0,0 +1,228 @@
+"""Offline unit tests for FusionExecutor.run (UMB-120).
+
+All tests mock the HTTP layer — no live network/API calls are made. The HTTP
+seam is patched at ``requests.post`` (the executor prefers ``requests`` when it
+imports successfully), so the request body can be inspected and the response
+faked.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import sys
+from typing import Any
+
+import pytest
+
+# Import the executor module directly by file path so these tests stay offline
+# and free of the package __init__ (which imports torch via router.py). The
+# executor itself has no torch dependency. The module is registered in
+# sys.modules before execution so its dataclasses resolve field types.
+_EXECUTOR_PATH = os.path.join(os.path.dirname(__file__), "..", "executor.py")
+_spec = importlib.util.spec_from_file_location("fusion_gate_executor", _EXECUTOR_PATH)
+assert _spec is not None and _spec.loader is not None
+_executor = importlib.util.module_from_spec(_spec)
+sys.modules[_spec.name] = _executor
+_spec.loader.exec_module(_executor)
+
+CostCeilingExceeded = _executor.CostCeilingExceeded
+FusionExecutor = _executor.FusionExecutor
+FusionResult = _executor.FusionResult
+
+PANEL = ["model-a", "model-b", "model-c"]
+JUDGE = "judge-model"
+
+# Per-model unit prices mirroring default_llm.json's input_price/output_price.
+LLM_DATA: dict[str, dict[str, Any]] = {
+    "model-a": {"input_price": 0.20, "output_price": 0.20},
+    "model-b": {"input_price": 0.60, "output_price": 0.60},
+    "model-c": {"input_price": 0.90, "output_price": 0.90},
+    "judge-model": {"input_price": 1.20, "output_price": 1.20},
+}
+
+API_KEYS = {"OpenRouter": "sk-test-key"}
+
+
+class _FakeResponse:
+    """Canned stand-in for a requests.Response: never raises, returns a fixed JSON payload."""
+
+    def __init__(self, payload: dict[str, Any]) -> None:
+        self._payload = payload
+
+    def raise_for_status(self) -> None:  # noqa: D401 - mirror requests API
+        return None
+
+    def json(self) -> dict[str, Any]:
+        return self._payload
+
+
+def _make_executor(cost_ceiling: float | None = None) -> FusionExecutor:
+    return FusionExecutor(
+        llm_data=LLM_DATA,
+        judge=JUDGE,
+        cost_ceiling=cost_ceiling,
+    )
+
+
+def _patch_post(monkeypatch, payload: dict[str, Any], captured: dict[str, Any]) -> None:
+    """Patch requests.post: record url/headers/body/timeout in ``captured``, respond with ``payload``."""
+
+    def fake_post(url, headers=None, json=None, timeout=None, **kwargs):  # noqa: A002
+        captured["url"] = url
+        captured["headers"] = headers
+        captured["body"] = json
+        captured["timeout"] = timeout
+        return _FakeResponse(payload)
+
+    import requests
+
+    monkeypatch.setattr(requests, "post", fake_post)
+
+
+def test_happy_path_parses_responses_and_analysis(monkeypatch):
+    payload = {
+        "status": "ok",
+        "answer": "Fused answer.",
+        "analysis": {
+            "consensus": "All models agree X.",
+            "contradictions": ["b disagrees on Y"],
+            "blind_spots": ["none flagged Z"],
+        },
+        "responses": [
+            {"model": "model-a", "content": "answer from a"},
+            {"model": "model-b", "content": "answer from b"},
+            {"model": "model-c", "content": "answer from c"},
+        ],
+        "cost": 1.23,
+    }
+    captured: dict[str, Any] = {}
+    _patch_post(monkeypatch, payload, captured)
+
+    result = _make_executor().run("What is 2+2?", PANEL, api_keys=API_KEYS)
+
+    assert isinstance(result, FusionResult)
+    assert result.answer == "Fused answer."
+    assert result.analysis == {
+        "consensus": "All models agree X.",
+        "contradictions": ["b disagrees on Y"],
+        "blind_spots": ["none flagged Z"],
+    }
+    assert [r["model"] for r in result.responses] == PANEL
+    assert result.responses[0]["content"] == "answer from a"
+    assert result.panel == PANEL
+    assert result.judge == JUDGE
+    assert result.cost == 1.23
+    assert result.raw == payload
+
+
+def test_request_body_uses_required_tool_choice_and_panel(monkeypatch):
+    payload = {
+        "status": "ok",
+        "answer": "ok",
+        "analysis": {"consensus": "c", "contradictions": [], "blind_spots": []},
+        "responses": [{"model": "model-a", "content": "x"}],
+    }
+    captured: dict[str, Any] = {}
+    _patch_post(monkeypatch, payload, captured)
+
+    _make_executor().run("q", PANEL, api_keys=API_KEYS)
+
+    body = captured["body"]
+    assert body["tool_choice"] == "required"
+    assert body["messages"] == [{"role": "user", "content": "q"}]
+
+    tool = body["tools"][0]
+    assert tool["type"] == "openrouter:fusion"
+    assert tool["parameters"]["analysis_models"] == PANEL
+    assert tool["parameters"]["model"] == JUDGE
+
+    # The Authorization header carries the key but the body never does.
+    assert captured["headers"]["Authorization"] == "Bearer sk-test-key"
+    assert "sk-test-key" not in json.dumps(body)
+
+
+def test_judge_failure_falls_back_without_crashing(monkeypatch):
+    # status "ok" but analysis omitted -> synthesize from responses[].
+    payload = {
+        "status": "ok",
+        "responses": [
+            {"model": "model-a", "content": "partial a"},
+            {"model": "model-b", "content": "partial b"},
+        ],
+    }
+    captured: dict[str, Any] = {}
+    _patch_post(monkeypatch, payload, captured)
+
+    result = _make_executor().run("q", PANEL, api_keys=API_KEYS)
+
+    assert result.analysis is None
+    assert result.answer == "partial a\n\npartial b"
+    assert [r["model"] for r in result.responses] == ["model-a", "model-b"]
+
+
+def test_project_cost_is_per_query_dollars():
+    """project_cost yields an estimated per-query DOLLAR figure, not a unit-price proxy.
+
+    LLM_DATA prices are per-million-token. Each panel member and the judge adds
+    (input_price*prompt_tokens + output_price*completion_tokens)/1e6, where
+    prompt_tokens is estimated from the query (max(1, len(query)//4)) and
+    completion_tokens = est_completion_tokens (default 512).
+    """
+    executor = _make_executor()
+    query = "x" * 400  # 400 chars -> ~100 prompt tokens
+    projected = executor.project_cost(PANEL, JUDGE, query=query)
+
+    n_prompt = max(1, len(query) // 4)  # 100
+    n_completion = 512
+    expected = 0.0
+    for member in [*PANEL, JUDGE]:
+        prices = LLM_DATA[member]
+        expected += (prices["input_price"] * n_prompt + prices["output_price"] * n_completion) / 1e6
+
+    assert projected == pytest.approx(expected)
+    # Dollar-scale: a realistic per-query cost is well under a dollar here.
+    assert 0.0 < projected < 0.01
+
+
+def test_cost_ceiling_aborts_before_http_call(monkeypatch):
+    # Sentinel post that fails the test if the network layer is reached.
+    def boom(*args, **kwargs):
+        raise AssertionError("HTTP call must not happen when cost ceiling exceeded")
+
+    import requests
+
+    monkeypatch.setattr(requests, "post", boom)
+
+    # A realistic per-query DOLLAR projection (~$0.0015 for this panel+judge) must
+    # trip a tight dollar ceiling. The ceiling is now interpreted as dollars/query.
+    executor = _make_executor(cost_ceiling=0.0005)
+
+    with pytest.raises(CostCeilingExceeded) as exc:
+        executor.run("q", PANEL, judge=JUDGE, api_keys=API_KEYS)
+
+    assert exc.value.ceiling == 0.0005
+    assert exc.value.projected > 0.0005
+    # Sanity: the projection is dollar-scale, not the old unit-price-sum proxy (~5.8).
+    assert exc.value.projected < 0.01
+
+
+def test_realistic_cost_ceiling_allows_when_under_cap(monkeypatch):
+    """A realistic $0.05/query ceiling does NOT abort this cheap panel."""
+    payload = {
+        "status": "ok",
+        "answer": "ok",
+        "analysis": {"consensus": "c", "contradictions": [], "blind_spots": []},
+        "responses": [{"model": "model-a", "content": "x"}],
+    }
+    captured: dict[str, Any] = {}
+    _patch_post(monkeypatch, payload, captured)
+
+    executor = _make_executor(cost_ceiling=0.05)
+    result = executor.run("q", PANEL, judge=JUDGE, api_keys=API_KEYS)
+    assert result.answer == "ok"
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))
diff --git a/custom_routers/fusion_gate/tests/test_fusion_log.py b/custom_routers/fusion_gate/tests/test_fusion_log.py
new file mode 100644
index 0000000..24d9359
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/test_fusion_log.py
@@ -0,0 +1,209 @@
+"""Offline tests for the fusion log sink (UMB-125).
+
+Fully offline: no network, no torch, no real home directory. All writes go to a
+per-test temp directory. The target modules are loaded by file path (not via the
+package ``__init__``, which imports torch through router.py), so this suite runs
+in a torch-free environment. Run with either:
+
+    python -m pytest custom_routers/fusion_gate/tests/test_fusion_log.py
+    python custom_routers/fusion_gate/tests/test_fusion_log.py
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+# Load the target modules directly by file path so these tests stay offline and
+# free of the package __init__ (which imports torch via router.py). Both modules
+# are torch-free. Each is registered in sys.modules before execution so the
+# relative ``from .executor import FusionResult`` inside fusion_log.py — and the
+# dataclass field types in executor.py — resolve. This mirrors test_executor.py.
+_PLUGIN_DIR = Path(__file__).resolve().parents[1]
+
+
+def _load_module(name: str, filename: str):
+    spec = importlib.util.spec_from_file_location(name, os.path.join(str(_PLUGIN_DIR), filename))
+    assert spec is not None and spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+# executor must be registered under the package-relative name that fusion_log's
+# ``from .executor import FusionResult`` resolves to, so the dataclass identities
+# match. A stub package (fusion_gate_pkg) is registered first; fusion_log.py is
+# then loaded as its submodule, so its .executor is the executor loaded below.
+_pkg = type(sys)("fusion_gate_pkg")
+_pkg.__path__ = [str(_PLUGIN_DIR)]
+sys.modules["fusion_gate_pkg"] = _pkg
+
+_executor = _load_module("fusion_gate_pkg.executor", "executor.py")
+_fusion_log = _load_module("fusion_gate_pkg.fusion_log", "fusion_log.py")
+
+FusionResult = _executor.FusionResult
+DEFAULT_SINK_PATH = _fusion_log.DEFAULT_SINK_PATH
+log_fusion = _fusion_log.log_fusion
+to_training_rows = _fusion_log.to_training_rows
+
+
+def _sample_result() -> FusionResult:
+    """A representative fusion result with a 3-model panel and a judge."""
+    return FusionResult(
+        answer="Synthesized final answer.",
+        analysis={
+            "consensus": "All three models agree on the core claim.",
+            "contradictions": [],
+            "blind_spots": ["edge case X"],
+        },
+        responses=[
+            {"model": "qwen2.5-7b-instruct", "content": "Answer from qwen."},
+            {"model": "llama-3.1-8b-instruct", "content": "Answer from llama."},
+            {"model": "mistral-7b-instruct-v0.3", "content": "Answer from mistral."},
+        ],
+        panel=[
+            "qwen2.5-7b-instruct",
+            "llama-3.1-8b-instruct",
+            "mistral-7b-instruct-v0.3",
+        ],
+        judge="llama3-70b-instruct",
+        cost=0.0042,
+        raw={"id": "gen-123", "authorization": "Bearer sk-secretkey-should-never-leak"},
+    )
+
+
+class TestLogFusion(unittest.TestCase):
+    def setUp(self) -> None:
+        self._tmp = tempfile.TemporaryDirectory()
+        self.tmp_path = Path(self._tmp.name)
+        self.sink = self.tmp_path / "nested" / "fusion_log.jsonl"
+        self.query = "Explain the tradeoffs of consensus algorithms."
+
+    def tearDown(self) -> None:
+        self._tmp.cleanup()
+
+    def test_entry_appended(self) -> None:
+        """One call writes exactly one JSONL line; a second call appends."""
+        result = _sample_result()
+
+        returned = log_fusion(result, self.query, sink_path=str(self.sink), token=512, cost=0.0042)
+        self.assertEqual(returned, self.sink)
+        self.assertTrue(self.sink.exists())
+
+        lines = self.sink.read_text(encoding="utf-8").splitlines()
+        self.assertEqual(len(lines), 1)
+
+        log_fusion(result, self.query, sink_path=str(self.sink), token=256, cost=0.002)
+        lines = self.sink.read_text(encoding="utf-8").splitlines()
+        self.assertEqual(len(lines), 2)
+
+    def test_entry_required_fields(self) -> None:
+        """Logged entry carries the full structured schema."""
+        log_fusion(_sample_result(), self.query, sink_path=str(self.sink), token=512, cost=0.0042)
+        entry = json.loads(self.sink.read_text(encoding="utf-8").splitlines()[0])
+
+        for field in ("ts", "strategy", "query", "panel", "judge", "responses", "analysis", "token", "cost"):
+            self.assertIn(field, entry, f"missing field: {field}")
+
+        self.assertEqual(entry["strategy"], "fusion")
+        self.assertEqual(entry["query"], self.query)
+        self.assertEqual(entry["judge"], "llama3-70b-instruct")
+        self.assertEqual(entry["token"], 512)
+        self.assertEqual(entry["cost"], 0.0042)
+        self.assertEqual(len(entry["responses"]), 3)
+        self.assertEqual(len(entry["panel"]), 3)
+        self.assertIn("consensus", entry["analysis"])
+
+    def test_cost_falls_back_to_result(self) -> None:
+        """When cost arg is None, the entry uses result.cost."""
+        log_fusion(_sample_result(), self.query, sink_path=str(self.sink))
+        entry = json.loads(self.sink.read_text(encoding="utf-8").splitlines()[0])
+        self.assertEqual(entry["cost"], 0.0042)
+        self.assertIsNone(entry["token"])
+
+    def test_no_key_leakage(self) -> None:
+        """Raw provider payload and credential shapes never reach the file."""
+        log_fusion(_sample_result(), self.query, sink_path=str(self.sink), token=1)
+        text = self.sink.read_text(encoding="utf-8")
+
+        self.assertNotIn("sk-secretkey-should-never-leak", text)
+        self.assertNotIn("authorization", text.lower())
+        self.assertNotIn("bearer", text.lower())
+        self.assertNotIn("gen-123", text)  # raw payload is dropped entirely
+
+        entry = json.loads(text.splitlines()[0])
+        self.assertNotIn("raw", entry)
+
+    def test_inline_secret_in_response_redacted(self) -> None:
+        """Inline credential shapes inside response content are redacted."""
+        result = FusionResult(
+            responses=[
+                {"model": "qwen2.5-7b-instruct", "content": "Use key sk-abcdef0123456789 to call it."},
+            ],
+            panel=["qwen2.5-7b-instruct"],
+            judge=None,
+        )
+        log_fusion(result, self.query, sink_path=str(self.sink))
+        text = self.sink.read_text(encoding="utf-8")
+        self.assertNotIn("sk-abcdef0123456789", text)
+        self.assertIn("[REDACTED]", text)
+
+    def test_default_sink_path_uses_llmrouter_home(self) -> None:
+        """The documented default lands under ~/.llmrouter (not asserted to disk)."""
+        self.assertTrue(DEFAULT_SINK_PATH.endswith("openclaw_memory.jsonl"))
+        self.assertIn(".llmrouter", DEFAULT_SINK_PATH)
+
+
+class TestToTrainingRows(unittest.TestCase):
+    def setUp(self) -> None:
+        self.query = "Explain the tradeoffs of consensus algorithms."
+
+    def test_responses_decompose_to_n_rows(self) -> None:
+        """N panel responses produce N per-model rows."""
+        result = _sample_result()
+        rows = to_training_rows(result, self.query)
+        self.assertEqual(len(rows), len(result.responses))
+
+        models = [r["model_name"] for r in rows]
+        self.assertEqual(
+            models,
+            ["qwen2.5-7b-instruct", "llama-3.1-8b-instruct", "mistral-7b-instruct-v0.3"],
+        )
+
+    def test_row_required_schema_fields(self) -> None:
+        """Each row carries the FusionFactory-consumable schema."""
+        rows = to_training_rows(_sample_result(), self.query)
+        for row in rows:
+            for field in ("query", "model_name", "model", "response", "performance"):
+                self.assertIn(field, row, f"missing field: {field}")
+            self.assertEqual(row["query"], self.query)
+            self.assertEqual(row["model_name"], row["model"])
+            self.assertIsNone(row["performance"])
+
+    def test_rows_no_key_leakage(self) -> None:
+        """Inline secrets in response content are scrubbed in training rows."""
+        result = FusionResult(
+            responses=[
+                {"model": "qwen2.5-7b-instruct", "content": "token Bearer sk-leak0123456789abc here"},
+            ],
+            panel=["qwen2.5-7b-instruct"],
+        )
+        rows = to_training_rows(result, self.query)
+        blob = json.dumps(rows)
+        self.assertNotIn("sk-leak0123456789abc", blob)
+        self.assertIn("[REDACTED]", blob)
+
+    def test_empty_responses_yield_no_rows(self) -> None:
+        """No panel responses -> no rows (fail-safe, not an error)."""
+        result = FusionResult(responses=[], panel=[])
+        self.assertEqual(to_training_rows(result, self.query), [])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/custom_routers/fusion_gate/tests/test_gate.py b/custom_routers/fusion_gate/tests/test_gate.py
new file mode 100644
index 0000000..9e573fa
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/test_gate.py
@@ -0,0 +1,202 @@
+"""Offline unit tests for ``RouteGate`` (UMB-119).
+
+These tests run fully offline with no network, no torch, and no trained model:
+the lexical fallback is exercised directly, and the injected-estimator path is
+driven by a plain Python stub. Compatible with pytest (``pytest test_gate.py``)
+and also runnable standalone (``python test_gate.py``) since pytest is not a
+hard dependency of this repo.
+
+Coverage:
+  - single tier for an easy / high-confidence query
+  - fusion tier for a hard / low-confidence query
+  - high_stakes override forces fusion regardless of difficulty
+  - the threshold is config-driven (passing a different threshold flips the tier)
+  - the cheapest model is selected on the single path
+  - the injected estimator overrides the lexical heuristic when an embedding is present
+  - confidence rises with distance from the threshold
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import os
+import sys
+
+# Load gate.py by file path rather than importing the package. The package
+# __init__ pulls in router.py, which depends on torch; the gate has no such
+# dependency, so loading the module directly keeps these tests torch-free and
+# fully offline. The module is registered in sys.modules before execution so
+# dataclass field-annotation resolution (PEP 563) can find its namespace.
+_GATE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "gate.py"))
+_spec = importlib.util.spec_from_file_location("fusion_gate_gate", _GATE_PATH)
+_gate_mod = importlib.util.module_from_spec(_spec)
+sys.modules[_spec.name] = _gate_mod
+_spec.loader.exec_module(_gate_mod)
+
+GateDecision = _gate_mod.GateDecision
+RouteGate = _gate_mod.RouteGate
+
+# Minimal candidate set mirroring default_llm.json shape (name -> prices).
+LLM_DATA = {
+    "cheap-7b": {"input_price": 0.20, "output_price": 0.20},
+    "mid-49b": {"input_price": 0.90, "output_price": 0.90},
+    "big-141b": {"input_price": 1.20, "output_price": 1.20},
+}
+
+
+def _gate(threshold: float = 0.5, estimator=None) -> RouteGate:
+    return RouteGate(llm_data=LLM_DATA, threshold=threshold, estimator=estimator)
+
+
+# --------------------------------------------------------------------- tiers
+
+
+def test_easy_query_routes_single_high_confidence():
+    """A short, simple question stays on the cheap single path with high confidence."""
+    gate = _gate(threshold=0.5)
+    decision = gate.decide({"query": "What is the capital of France?"})
+
+    assert isinstance(decision, GateDecision)
+    assert decision.tier == "single"
+    assert decision.model_name == "cheap-7b"  # cheapest capable model
+    assert decision.panel == []
+    assert decision.difficulty < 0.5
+    assert decision.confidence > 0.5  # clearly below threshold => confident
+
+
+def test_hard_query_routes_fusion_low_confidence_near_threshold() -> None:
+    """A long, multi-part code/math query escalates to fusion (confidence is not asserted here)."""
+    gate = _gate(threshold=0.5)
+    query = (
+        "Write a function to compute the integral of a matrix, then prove its "
+        "complexity, and also debug this regex; how do these interact with the "
+        "algorithm above and what is the derivative? " * 2  # literals join first, so the whole prompt is doubled
+    )
+    decision = gate.decide({"query": query})
+
+    assert decision.tier == "fusion"
+    assert decision.model_name is None  # single-only model_name; None for fusion
+    assert decision.difficulty >= 0.5
+
+
+def test_high_stakes_forces_fusion_even_for_easy_query():
+    """high_stakes overrides difficulty and forces fusion at full confidence."""
+    gate = _gate(threshold=0.5)
+    decision = gate.decide({"query": "Hi", "high_stakes": True})
+
+    assert decision.tier == "fusion"
+    assert decision.model_name is None
+    assert decision.confidence == 1.0  # caller override => fully confident in fusing
+
+
+def test_threshold_is_config_driven():
+    """The same query flips tier purely based on the injected threshold."""
+    query = {"query": "Explain how a hash map handles collisions and resizing."}
+
+    # Difficulty for this query is some fixed value d in (0, 1). Compute it once.
+    d = _gate()._lexical_difficulty(query["query"])
+    assert 0.0 < d < 1.0  # guard: must straddle for the test to be meaningful
+
+    lenient = _gate(threshold=d + 0.1).decide(query)  # threshold above d => single
+    strict = _gate(threshold=d - 0.1).decide(query)    # threshold below d => fusion
+
+    assert lenient.tier == "single"
+    assert strict.tier == "fusion"
+
+
+# ------------------------------------------------------------ model selection
+
+
+def test_single_path_selects_cheapest_model():
+    """Single path always returns the lowest-cost candidate."""
+    gate = _gate(threshold=0.99)  # force single for almost anything
+    decision = gate.decide({"query": "easy"})
+    assert decision.tier == "single"
+    assert decision.model_name == "cheap-7b"
+
+
+# ------------------------------------------------------------- estimator path
+
+
+def test_injected_estimator_overrides_lexical_heuristic():
+    """When an embedding + estimator are provided, the estimator decides difficulty."""
+    # Estimator ignores the embedding and returns a fixed hard score.
+    hard_estimator = lambda _embedding: 0.95  # noqa: E731
+    gate = _gate(threshold=0.5, estimator=hard_estimator)
+
+    # Query text alone would be "easy"; the estimator should override it.
+    decision = gate.decide({"query": "easy", "embedding": [0.0, 0.0, 0.0]})
+    assert decision.tier == "fusion"
+    assert abs(decision.difficulty - 0.95) < 1e-9
+
+
+def test_estimator_output_with_item_is_coerced():
+    """A tensor-like return (exposing .item()) is coerced to a float scalar."""
+
+    class _ScalarLike:
+        def __init__(self, value: float):
+            self._value = value
+
+        def item(self) -> float:
+            return self._value
+
+    gate = _gate(threshold=0.5, estimator=lambda _e: _ScalarLike(0.10))
+    decision = gate.decide({"query": "anything", "embedding": [1.0]})
+    assert decision.tier == "single"
+    assert abs(decision.difficulty - 0.10) < 1e-9
+
+
+def test_estimator_ignored_without_embedding():
+    """No embedding => lexical fallback runs even if an estimator is wired in."""
+    gate = _gate(threshold=0.5, estimator=lambda _e: 0.99)
+    decision = gate.decide({"query": "What is 2 plus 2?"})  # no embedding key
+    # Falls back to lexical heuristic (easy) rather than the estimator's 0.99.
+    assert decision.tier == "single"
+    assert decision.difficulty < 0.5
+
+
+# ----------------------------------------------------------------- confidence
+
+
+def test_confidence_increases_with_distance_from_threshold():
+    """Confidence is monotonic in the absolute margin to the threshold."""
+    gate = _gate(threshold=0.5)
+    near = gate._confidence(0.5)   # at the boundary
+    mid = gate._confidence(0.7)
+    far = gate._confidence(1.0)
+    assert near < mid < far
+    assert near == 0.0
+    assert far == 1.0
+
+
+def test_lexical_difficulty_is_deterministic_and_bounded():
+    """The heuristic is pure and always returns a value in [0, 1]."""
+    gate = _gate()
+    samples = ["", "hi", "Explain the algorithm.", "a" * 5000, "1. x? 2. y? and z?"]
+    for text in samples:
+        d = gate._lexical_difficulty(text)
+        assert 0.0 <= d <= 1.0
+        assert gate._lexical_difficulty(text) == d  # deterministic
+
+
+# ----------------------------------------------------------------- runner
+
+def _run_all() -> int:
+    """Run every module-level test_* callable standalone; return 1 if any failed, else 0."""
+    cases = [fn for name, fn in sorted(globals().items()) if name.startswith("test_") and callable(fn)]
+    failed = 0
+    for case in cases:
+        try:
+            case()
+            print(f"PASS {case.__name__}")
+        except Exception as err:  # pragma: no cover - reporting path
+            failed += 1
+            verdict = "FAIL" if isinstance(err, AssertionError) else "ERROR"
+            detail = f"{err}" if verdict == "FAIL" else f"{type(err).__name__}: {err}"
+            print(f"{verdict} {case.__name__}: {detail}")
+    print(f"\n{len(cases) - failed}/{len(cases)} passed")
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(_run_all())
diff --git a/custom_routers/fusion_gate/tests/test_router.py b/custom_routers/fusion_gate/tests/test_router.py
new file mode 100644
index 0000000..6de9113
--- /dev/null
+++ b/custom_routers/fusion_gate/tests/test_router.py
@@ -0,0 +1,380 @@
+"""Offline integration tests for ``FusionGateRouter`` (UMB-121/123/124).
+
+These exercise the router end-to-end through a temp YAML config and a tiny
+in-memory candidate file. Fully offline:
+
+  * no large data files — the temp config references only ``llm_data`` (no
+    routing_data), so MetaRouter's DataLoader loads nothing heavy;
+  * no network — the only fusion-execution test monkeypatches ``requests.post``;
+  * ``--route-only`` / routing paths make NO HTTP call and spend nothing.
+
+Coverage:
+  - all six config keys are read and respected (threshold, k, judge,
+    provider/base_url, panel_preset, cost_ceiling)
+  - cost_ceiling downgrades fusion -> single (abort) with no spend
+  - route_single is spend-free (the route-only contract): a decision dict, no
+    executor.run invocation
+  - panel varies by query type
+  - all three tiers (single / budget_fusion / fusion) are reachable
+  - fuse() logs every call via fusion_log, scrubbed and raw-payload-free
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+# Make the repo importable as a package root so ``custom_routers`` resolves.
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+# router.py imports torch eagerly (MetaRouter subclasses nn.Module). When torch
+# is absent this module must SKIP cleanly rather than fail collection — otherwise
+# it interrupts the whole suite and the four torch-free modules never run. See
+# tests/conftest.py and fusion_gate/__init__.py for the torch-free design.
+pytest.importorskip("torch")
+
+from custom_routers.fusion_gate.router import FusionGateRouter  # noqa: E402
+
+# Tiny candidate set (default_llm.json shape). Distinct prices/sizes so panel
+# ordering and cost projection are deterministic.
+LLM_DATA: dict[str, dict[str, Any]] = {
+    "cheap-7b": {
+        "size": "7B",
+        "feature": "fast and efficient small model",
+        "input_price": 0.20,
+        "output_price": 0.20,
+        "model": "vendor/cheap-7b",
+        "service": "OpenRouter",
+    },
+    "mid-49b": {
+        "size": "49B",
+        "feature": "powerful high-accuracy model for complex tasks",
+        "input_price": 0.90,
+        "output_price": 0.90,
+        "model": "vendor/mid-49b",
+        "service": "OpenRouter",
+    },
+    "big-141b": {
+        "size": "141B",
+        "feature": "advanced large-scale model with exceptional performance",
+        "input_price": 1.20,
+        "output_price": 1.20,
+        "model": "vendor/big-141b",
+        "service": "OpenRouter",
+    },
+}
+
+
+def _write_config(
+    tmp_path: Path,
+    *,
+    threshold: float = 0.5,
+    budget_threshold: float | None = 0.3,
+    k: int = 2,
+    judge: str | None = None,
+    panel_preset: str = "Quality",
+    cost_ceiling: float | None = None,
+    log_sink_path: str | None = None,
+) -> str:
+    """Write a tiny llm_data JSON + a router YAML; return the YAML path."""
+    llm_path = tmp_path / "llm.json"
+    llm_path.write_text(json.dumps(LLM_DATA), encoding="utf-8")
+
+    hparam_lines = [
+        f"  threshold: {threshold}",
+        f"  k: {k}",
+        f"  judge: {('null' if judge is None else judge)}",
+        f"  panel_preset: '{panel_preset}'",
+        f"  cost_ceiling: {('null' if cost_ceiling is None else cost_ceiling)}",
+        "  provider: 'OpenRouter'",
+        "  base_url: 'https://openrouter.ai/api/v1'",
+        f"  budget_threshold: {('null' if budget_threshold is None else budget_threshold)}",
+    ]
+    if log_sink_path is not None:
+        hparam_lines.append(f"  log_sink_path: '{log_sink_path}'")
+
+    yaml_text = (
+        "data_path:\n"
+        f"  llm_data: '{llm_path}'\n"
+        "hparam:\n" + "\n".join(hparam_lines) + "\n"
+        "api_endpoint: 'https://openrouter.ai/api/v1'\n"
+    )
+    cfg_path = tmp_path / "config.yaml"
+    cfg_path.write_text(yaml_text, encoding="utf-8")
+    return str(cfg_path)
+
+
+def _router(tmp_path: Path, **kwargs) -> FusionGateRouter:
+    return FusionGateRouter(_write_config(tmp_path, **kwargs))
+
+
+# ----------------------------------------------------- config keys (UMB-121)
+
+
+def test_all_six_config_keys_are_read(tmp_path) -> None:
+    r = _router(
+        tmp_path,
+        threshold=0.42,
+        k=3,
+        judge="big-141b",
+        panel_preset="Budget",
+        cost_ceiling=7.5,
+    )
+    assert r.threshold == 0.42
+    assert r.k == 3
+    assert r.judge == "big-141b"
+    assert r.panel_preset == "Budget"
+    assert r.cost_ceiling == 7.5
+    # provider/base_url pair (the 6th key); values are hard-coded by _write_config.
+    assert r.provider == "OpenRouter"
+    assert r.base_url == "https://openrouter.ai/api/v1"
+    # base_url is threaded into the executor's endpoint.
+    assert r.executor.api_endpoint == "https://openrouter.ai/api/v1"
+    # threshold/judge/cost_ceiling are threaded into the gate/executor (k is not checked there).
+    assert r.gate.threshold == 0.42
+    assert r.executor.judge == "big-141b"
+    assert r.executor.cost_ceiling == 7.5
+
+
+# --------------------------------------------------- spend-free route-only
+
+
+def test_route_single_is_spend_free(tmp_path, monkeypatch):
+    """Routing alone must never reach the executor, i.e. no API call, no spend."""
+
+    def fail_if_called(*_args, **_kwargs):
+        raise AssertionError("routing must not call the executor / network")
+
+    # threshold=0.0 forces the fusion strategy for any query.
+    router = _router(tmp_path, threshold=0.0)
+    monkeypatch.setattr(router.executor, "run", fail_if_called)
+
+    routed = router.route_single({"query": "Solve this hard logic puzzle step by step"})
+    assert routed["strategy"] == "fusion"
+    assert "panel" in routed and routed["panel"]
+    # model_name is only a label for CLI route_query compatibility; no call is made.
+    assert "judge" in routed and routed["model_name"] is not None
+
+
+def test_route_only_decision_shape_for_single(tmp_path):
+    """A single-tier decision labels one known model as its prediction."""
+    router = _router(tmp_path, threshold=0.99, budget_threshold=0.98)  # force single
+    routed = router.route_single({"query": "Hi"})
+    assert (routed["strategy"], routed["tier"]) == ("single", "single")
+    assert routed["model_name"] in LLM_DATA
+    assert routed["predicted_llm"] == routed["model_name"]
+
+
+# -------------------------------------------------- cost_ceiling (UMB-121)
+
+
+def test_cost_ceiling_downgrades_fusion_to_single(tmp_path):
+    """A tight per-query DOLLAR ceiling turns a fusion decision into single.
+
+    With threshold=0.0 the gate would fuse, but the projected dollar cost of the
+    k=2 panel must exceed the $0.0005 ceiling and force the downgrade. Guards
+    against a dollar projection that silently ignores a sub-$1 ceiling.
+    """
+    ceiling = 0.0005
+    router = _router(tmp_path, threshold=0.0, k=2, cost_ceiling=ceiling)
+    routed = router.route_single({"query": "Solve this hard logic puzzle step by step"})
+    projected = routed["projected_cost"]
+
+    assert (routed["strategy"], routed["tier"]) == ("single", "single")
+    assert routed["downgraded_from"] in ("budget_fusion", "fusion")
+    # Over the ceiling, yet dollar-scale rather than the old unit-price-sum proxy.
+    assert ceiling < projected < 0.01
+    assert routed["model_name"] in LLM_DATA  # cheapest single fallback
+
+
+def test_cost_ceiling_allows_fusion_when_under_cap(tmp_path):
+    """A $0.05/query ceiling is expected to clear the k=2 panel, so fusion stays."""
+    router = _router(tmp_path, threshold=0.0, k=2, cost_ceiling=0.05)
+    routed = router.route_single({"query": "Solve this hard logic puzzle step by step"})
+    assert routed["strategy"] == "fusion"
+    assert routed["projected_cost"] <= 0.05
+
+
+# -------------------------------------------------- panel varies by query
+
+
+def test_panel_varies_by_query_type(tmp_path):
+    """Panels built from capability scores differ across query categories (UMB-123).
+
+    The injected routing data makes cheap-7b the top performer on the code task
+    and big-141b the top performer on the logic task, so a code query and a
+    reasoning query must lead with different models.
+    """
+    router = _router(tmp_path, threshold=0.0, k=2)
+    from custom_routers.fusion_gate.capability import CapabilityScorer
+
+    performance_by_task = {
+        "humaneval-code": {"cheap-7b": 0.95, "mid-49b": 0.30, "big-141b": 0.20},
+        "agentverse-logicgrid": {"cheap-7b": 0.10, "mid-49b": 0.50, "big-141b": 0.98},
+    }
+    rows = [
+        {"task_name": task, "model_name": model, "performance": score}
+        for task, scores in performance_by_task.items()
+        for model, score in scores.items()
+    ]
+    router.capability = CapabilityScorer(llm_data=LLM_DATA, routing_data=rows)
+
+    code_query = {"query": "Write a function to fix this bug in my code"}
+    logic_query = {"query": "Solve this logic puzzle, reason step by step"}
+    code_panel = router.route_single(code_query)["panel"]
+    logic_panel = router.route_single(logic_query)["panel"]
+
+    assert code_panel[0] == "cheap-7b"
+    assert logic_panel[0] == "big-141b"
+    assert code_panel != logic_panel
+
+
+def test_preset_fallback_when_capability_unavailable(tmp_path):
+    """With no capability-scored panel, the configured preset picks the panel."""
+    routers = {
+        preset: _router(tmp_path, threshold=0.0, k=2, panel_preset=preset)
+        for preset in ("Quality", "Budget")
+    }
+    panels = {}
+    for preset, router in routers.items():
+        # A select_panel that returns None stands in for missing capability data.
+        router.capability.select_panel = lambda query, k: None
+        panels[preset] = router.route_single({"query": "general question"})["panel"]
+
+    assert panels["Quality"][0] == "big-141b"  # Quality preset -> most capable first
+    assert panels["Budget"][0] == "cheap-7b"  # Budget preset -> cheapest first
+    assert panels["Quality"] != panels["Budget"]
+
+
+# --------------------------------------------------- three tiers (UMB-124)
+
+
+def test_all_three_tiers_reachable(tmp_path):
+    """The two thresholds make single / budget_fusion / fusion all reachable."""
+    router = _router(tmp_path, threshold=0.6, budget_threshold=0.2, k=2, cost_ceiling=None)
+
+    def tier_at(difficulty):
+        """Route a placeholder query with the gate's difficulty estimate pinned."""
+        router.gate.estimator = lambda _e: difficulty
+        return router.route_single({"query": "x", "embedding": [0.0]})["tier"]
+
+    # With the unpatched estimator, "Hi" is expected to rate as easy.
+    assert router.route_single({"query": "Hi"})["tier"] == "single"
+
+    # 0.4 lies between budget_threshold (0.2) and threshold (0.6).
+    assert tier_at(0.4) == "budget_fusion"
+    # 0.9 lies above threshold (0.6).
+    assert tier_at(0.9) == "fusion"
+
+
+def test_budget_tier_uses_budget_preset_on_fallback(tmp_path):
+    """The budget_fusion tier falls back to the Budget panel, not the configured one.
+
+    The router is configured with panel_preset='Quality', yet a mid-difficulty
+    query with no capability-scored panel must still lead with cheap-7b (UMB-124).
+    """
+    router = _router(tmp_path, threshold=0.6, budget_threshold=0.2, k=2, panel_preset="Quality")
+    router.gate.estimator = lambda _e: 0.4  # between the thresholds -> budget_fusion
+    router.capability.select_panel = lambda query, k: None  # force preset fallback
+
+    routed = router.route_single({"query": "x", "embedding": [0.0]})
+    assert routed["tier"] == "budget_fusion"
+    assert routed["panel"][0] == "cheap-7b"  # Budget preset -> cheapest first
+
+
+# ----------------------------------------------- fuse() logs (UMB-125 wiring)
+
+
+def test_fuse_logs_every_call_scrubbed(tmp_path, monkeypatch):
+    """fuse() appends exactly one log row per call, with secrets scrubbed."""
+    leaked_secret = "sk-abcdefghijklmnop"
+    log_path = tmp_path / "fusion_log.jsonl"
+    router = _router(tmp_path, threshold=0.0, k=2, log_sink_path=str(log_path))
+    canned_response = {
+        "status": "ok",
+        "answer": "Fused answer.",
+        "analysis": {"consensus": "agree", "contradictions": [], "blind_spots": []},
+        "responses": [
+            {"model": "big-141b", "content": f"leaked {leaked_secret} secret"},
+            {"model": "mid-49b", "content": "ok"},
+        ],
+        "cost": 2.0,
+    }
+
+    def fake_post(body, api_key):
+        return canned_response
+
+    # Stub the executor's HTTP seam itself, so nothing depends on which HTTP
+    # client the executor would pick and no real network call is ever made.
+    monkeypatch.setattr(router.executor, "_post_chat_completions", fake_post)
+
+    routed = router.route_single({"query": "Solve this hard logic puzzle step by step"})
+    fused = router.fuse(routed, api_keys={"OpenRouter": "sk-test-key"})
+
+    assert fused.answer == "Fused answer."
+    assert log_path.exists()
+    rows = log_path.read_text(encoding="utf-8").strip().splitlines()
+    assert len(rows) == 1
+    record = json.loads(rows[0])
+    assert record["strategy"] == "fusion"
+    assert record["query"].startswith("Solve this hard logic puzzle")
+    # Raw provider payload must NOT be persisted, and inline secrets scrubbed.
+    assert "raw" not in record
+    dumped = json.dumps(record)
+    assert leaked_secret not in dumped
+    assert "sk-test-key" not in dumped
+
+
+def test_fuse_returns_result_when_log_sink_unwritable(tmp_path, monkeypatch):
+    """A logging failure must NOT destroy the already-computed FusionResult.
+
+    The sink directory is made unwritable so ``log_fusion`` raises an OSError on
+    write, which ``fuse()`` must swallow. Skipped where chmod is not enforced.
+    """
+    locked_dir = tmp_path / "locked"
+    locked_dir.mkdir()
+    sink = locked_dir / "fusion_log.jsonl"
+    r = _router(tmp_path, threshold=0.0, k=2, log_sink_path=str(sink))
+
+    payload = {
+        "status": "ok",
+        "answer": "Fused answer.",
+        "analysis": {"consensus": "agree", "contradictions": [], "blind_spots": []},
+        "responses": [{"model": "big-141b", "content": "ok"}],
+        "cost": 2.0,
+    }
+    monkeypatch.setattr(r.executor, "_post_chat_completions", lambda body, api_key: payload)
+
+    # Drop owner write permission (0o500 = r-x) so creating the sink file fails.
+    os.chmod(locked_dir, 0o500)
+    try:
+        # Root and Windows ignore the mode bits; the write would then succeed.
+        if os.access(locked_dir, os.W_OK):
+            pytest.skip("directory permissions are not enforced (root or Windows)")
+        decision = r.route_single({"query": "Solve this hard logic puzzle step by step"})
+        result = r.fuse(decision, api_keys={"OpenRouter": "sk-test-key"})
+    finally:
+        os.chmod(locked_dir, 0o700)  # restore perms so tmp_path cleanup can remove it
+
+    # The result survives even though the log write failed.
+    assert result.answer == "Fused answer."
+    assert not sink.exists()
+
+
+def test_fuse_rejects_non_fusion_result(tmp_path):
+    """fuse() raises ValueError when handed a single-strategy decision."""
+    router = _router(tmp_path, threshold=0.99, budget_threshold=0.98)
+    routed = router.route_single({"query": "Hi"})
+    assert routed["strategy"] == "single"
+    pytest.raises(ValueError, router.fuse, routed)
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))  # run this file verbosely; exit with pytest's code