From b528030567cec183158ebd5af4dc76a9be77564a Mon Sep 17 00:00:00 2001 From: ConsultingFuture4200 Date: Mon, 15 Jun 2026 11:37:31 -0700 Subject: [PATCH] feat(custom_routers): add FusionGateRouter route-vs-fuse meta-router - Gate each query between single-model routing and OpenRouter openrouter:fusion (panel + judge), with a three-tier dial: single / budget_fusion / fusion - Isolate the beta openrouter:fusion server tool behind FusionExecutor (one blast point); graceful judge-failure fallback; per-query dollar cost_ceiling - Capability-scored panel selection with Quality/Budget preset fallback - --route-only spend-free preview; 6+ config keys; secret-scrubbed fusion logging producing FusionFactory-style training rows; offline retrain step - Three-arm offline eval harness + fixtures (mock = zero spend); 42 tests - Zero core edits; one optional provider; local fan-out fallback left as follow-up --- custom_routers/fusion_gate/.gitignore | 15 + custom_routers/fusion_gate/PR_BODY.md | 98 +++ custom_routers/fusion_gate/README.md | 176 ++++ custom_routers/fusion_gate/__init__.py | 34 + custom_routers/fusion_gate/capability.py | 390 +++++++++ custom_routers/fusion_gate/config.yaml | 58 ++ custom_routers/fusion_gate/eval/RESULTS.md | 107 +++ custom_routers/fusion_gate/eval/__init__.py | 13 + .../fusion_gate/eval/eval_harness.py | 813 ++++++++++++++++++ .../eval/fixtures/hard_slice.jsonl | 16 + .../eval/fixtures/llm_candidates.json | 58 ++ .../eval/fixtures/routing_data.jsonl | 28 + custom_routers/fusion_gate/eval/retrain.py | 464 ++++++++++ custom_routers/fusion_gate/executor.py | 429 +++++++++ custom_routers/fusion_gate/fusion_log.py | 204 +++++ custom_routers/fusion_gate/gate.py | 328 +++++++ custom_routers/fusion_gate/router.py | 270 ++++++ custom_routers/fusion_gate/tests/__init__.py | 1 + custom_routers/fusion_gate/tests/conftest.py | 37 + .../fusion_gate/tests/test_capability.py | 197 +++++ .../fusion_gate/tests/test_eval_harness.py | 227 +++++ 
.../fusion_gate/tests/test_executor.py | 228 +++++ .../fusion_gate/tests/test_fusion_log.py | 209 +++++ custom_routers/fusion_gate/tests/test_gate.py | 202 +++++ .../fusion_gate/tests/test_router.py | 380 ++++++++ 25 files changed, 4982 insertions(+) create mode 100644 custom_routers/fusion_gate/.gitignore create mode 100644 custom_routers/fusion_gate/PR_BODY.md create mode 100644 custom_routers/fusion_gate/README.md create mode 100644 custom_routers/fusion_gate/__init__.py create mode 100644 custom_routers/fusion_gate/capability.py create mode 100644 custom_routers/fusion_gate/config.yaml create mode 100644 custom_routers/fusion_gate/eval/RESULTS.md create mode 100644 custom_routers/fusion_gate/eval/__init__.py create mode 100644 custom_routers/fusion_gate/eval/eval_harness.py create mode 100644 custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl create mode 100644 custom_routers/fusion_gate/eval/fixtures/llm_candidates.json create mode 100644 custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl create mode 100644 custom_routers/fusion_gate/eval/retrain.py create mode 100644 custom_routers/fusion_gate/executor.py create mode 100644 custom_routers/fusion_gate/fusion_log.py create mode 100644 custom_routers/fusion_gate/gate.py create mode 100644 custom_routers/fusion_gate/router.py create mode 100644 custom_routers/fusion_gate/tests/__init__.py create mode 100644 custom_routers/fusion_gate/tests/conftest.py create mode 100644 custom_routers/fusion_gate/tests/test_capability.py create mode 100644 custom_routers/fusion_gate/tests/test_eval_harness.py create mode 100644 custom_routers/fusion_gate/tests/test_executor.py create mode 100644 custom_routers/fusion_gate/tests/test_fusion_log.py create mode 100644 custom_routers/fusion_gate/tests/test_gate.py create mode 100644 custom_routers/fusion_gate/tests/test_router.py diff --git a/custom_routers/fusion_gate/.gitignore b/custom_routers/fusion_gate/.gitignore new file mode 100644 index 0000000..dcd54e0 --- 
/dev/null +++ b/custom_routers/fusion_gate/.gitignore @@ -0,0 +1,15 @@ +# Compiled Python artifacts must not be tracked. These are build output, not +# source, and were committed by mistake. To purge ones already tracked: +# git rm -r --cached custom_routers/fusion_gate/**/__pycache__ +__pycache__/ +*.pyc +*.pyo + +# Eval harness runtime output. The harness writes results.csv / results.md here +# on every run; this is build output, not source, and must never be tracked. The +# committed, intentional report lives at eval/RESULTS.md instead. +eval/out/ + +# The repo root .gitignore ignores *.jsonl globally. Re-include the committed +# eval fixtures, which are source (the offline --mock harness depends on them). +!eval/fixtures/*.jsonl diff --git a/custom_routers/fusion_gate/PR_BODY.md b/custom_routers/fusion_gate/PR_BODY.md new file mode 100644 index 0000000..41c1ea3 --- /dev/null +++ b/custom_routers/fusion_gate/PR_BODY.md @@ -0,0 +1,98 @@ +# Add FusionGateRouter — a route-vs-fuse meta-router + +## Summary + +Adds `FusionGateRouter`, a self-contained custom router plugin under +`custom_routers/fusion_gate/` that gates each query between the cheap +single-model path and a multi-model **fusion** path, with fusion delegated to +OpenRouter's `openrouter:fusion` server tool. **Zero edits to core `llmrouter/` +code** — the plugin is auto-discovered via the existing `custom_routers/` +mechanism, exactly like `randomrouter` and `thresholdrouter`. + +## Motivation + +LLMRouter today picks *which single model* answers a query. The interesting +lever for hard queries is a different one: **route vs. fuse** — decide whether a +query is worth running a panel of models and synthesizing their answers. 
This PR +makes route-vs-fuse the **primary per-query dial**, expressed as a three-tier +escalation driven by estimated difficulty: + +``` +single -> budget_fusion (cheap panel) -> fusion (full Quality panel) +``` + +Cheap queries stay cheap; only the hard ones escalate, and the middle tier lets +mid-difficulty queries fuse on a budget panel instead of jumping straight to the +full Quality panel. + +## What's included + +**In scope:** +- `FusionGateRouter` — the route-vs-fuse gate (difficulty + confidence) plus capability-scored panel selection with a Quality/Budget preset fallback. +- An `openrouter:fusion` adapter (`executor.py`) — the single, isolated blast point for the beta server-tool API. +- A configurable surface (`threshold`, `k`, `judge`, `provider`/`base_url`, `panel_preset`, `cost_ceiling`, `est_completion_tokens`) and a `--route-only` spend-free preview that returns the decision + intended panel/judge without any API call. +- A per-query **dollar** cost guard (`cost_ceiling`) that downgrades fusion → single when the projected spend exceeds the cap. +- Secret-scrubbed fusion-call logging (`fusion_log.py`) producing FusionFactory-style `(query, model, response, performance)` training rows. +- A three-arm offline eval harness + bundled fixtures (`eval/`) and an offline retrain step. +- Self-contained: **ONE optional provider** (OpenRouter), **ZERO core edits**. + +**Out of scope (follow-ups):** +- **Local fan-out fallback is OUT of this PR.** Without an OpenRouter key only `--route-only` is exercisable. The executor interface is the seam a provider-agnostic local fan-out path would slot behind later — happy to add it if maintainers want it. +- A learned gate (the gate currently uses a duck-typed difficulty estimator with a deterministic lexical fallback so it runs with no trained model). + +## Eval results + +> **All committed numbers are from MOCK fixtures** (deterministic stub executor, +> zero spend, no network). 
They validate harness wiring and metric math, **not** +> real model quality. **Real numbers require a keyed live run** +> (`OPENROUTER_API_KEY` / `API_KEYS` set) against a real benchmark slice — that +> path is documented but intentionally not wired into the offline harness so a +> stray run cannot spend. See `eval/RESULTS.md`. + +Dataset: 16 held-out queries (6 easy + 10 hard; GSM8K / MATH / GPQA / MBPP). +Quality / blended cost / escalation `p` are over the full 16-query dataset; **gate +precision is computed over the same fixed 10-query hard slice for every arm** so the +arms are comparable (`always_route` makes no escalation decision → N/A). Slice +definitions are documented in `eval/RESULTS.md`. Blended cost is an estimated +**per-query dollar** amount. + +| Arm | n | Quality | Blended cost ($/query) | Escalation p | Gate-precision (hard slice) | +|-----|---|---------|------------------------|--------------|------------------------------| +| always_route | 16 | 0.3750 | 0.000650 | 0.0000 | n/a | +| always_fuse | 16 | 1.0000 | 0.001137 | 1.0000 | 1.0000 | +| fusion_gate | 16 | 1.0000 | 0.000767 | 0.6250 | 1.0000 | + +- **Quality target** — gate ≥ 95% of always-fuse quality: 1.0000 vs target 0.9500 → **PASS** (mock). +- **Cost target** — blended cost ≤ 1.6× always-route: ratio 1.18 → **PASS** (mock). +- **Gate precision** — escalated answers beating best single, over the hard slice: fusion_gate 10/10, always_fuse 10/10 → **measured** (mock). +- **Retrain delta** — offline log→retrain holds gate-precision at 1.0000 (threshold refit 0.400 → 0.520, budget_threshold 0.100 → 0.180). **Real delta pending a keyed live run.** + +## FusionFactory & continual learning + +Each fusion call yields a panel of per-model responses plus a judge synthesis — +exactly the `(query, model, response, performance)` observations FusionFactory +needs. 
`fusion_log.to_training_rows` decomposes them into rows shaped for +`llmrouter/data/api_calling_evaluation.py`, and the retrain step replays the +logged sink to refit the gate thresholds offline. This directly serves the +repo's **continual-learning TODO**: the router's own fusion traffic becomes the +training signal that sharpens the route-vs-fuse gate over time, with no separate +labeling pass required. + +## Beta server-tool caveat + +`openrouter:fusion` is an OpenRouter **BETA** server tool; its request/response +shape may change. All OpenRouter HTTP specifics are confined to `executor.py` +(request body, tool type, key resolution, transport, payload parsing), so an +upstream beta change touches one file. The executor degrades gracefully on judge +failure (synthesizes from panel responses). No API keys, auth headers, or raw +provider payloads are ever logged. + +## Testing + +Torch-free, fully offline (HTTP mocked): + +```bash +pytest custom_routers/fusion_gate/tests/ +python -m custom_routers.fusion_gate.eval.eval_harness --mock --with-retrain \ + --out custom_routers/fusion_gate/eval/out +``` diff --git a/custom_routers/fusion_gate/README.md b/custom_routers/fusion_gate/README.md new file mode 100644 index 0000000..d321bb7 --- /dev/null +++ b/custom_routers/fusion_gate/README.md @@ -0,0 +1,176 @@ +# FusionGateRouter + +**Type:** Meta-router (route-vs-fuse gate). No training required to run; an optional offline retrain step refits the gate from logged fusion calls. + +**Description:** A per-query gate that decides between the cheap **single-model** +path (classic LLMRouter routing) and a **fusion** path that runs a panel of +models and synthesizes their answers. Fusion is delegated to the OpenRouter +`openrouter:fusion` server tool (BETA — see the caveat below). Routing is +spend-free: the decision is computed locally and only `fuse()` ever calls the +provider. + +The primary per-query dial is **route vs. 
fuse**, expressed as three tiers: + +``` +difficulty < budget_threshold -> single (cheapest single model) +budget_threshold <= difficulty < threshold -> budget_fusion (cheap Budget panel) +difficulty >= threshold -> fusion (full Quality panel) +``` + +Set `budget_threshold: null` (or `>= threshold`) to disable the middle tier and +collapse to plain single/fusion. A `high_stakes: true` flag on a query forces +the full Quality `fusion` tier regardless of difficulty. + +## Usage + +```bash +# Inference (routes, then fuses via openrouter:fusion if the gate escalates) +llmrouter infer --router fusion_gate \ + --config custom_routers/fusion_gate/config.yaml \ + --query "Prove that the square root of 2 is irrational." + +# Route-only — compute the decision with ZERO spend / no network call +llmrouter infer --router fusion_gate \ + --config custom_routers/fusion_gate/config.yaml \ + --query "What is the capital of France?" \ + --route-only +``` + +`--route-only` returns the decision dict (tier, panel, judge, projected cost) +without ever calling OpenRouter. Spend happens only when `fuse()` is invoked. + +## Decision contract + +`route_single` returns one of two shapes (both carry `strategy`, `tier`, and +`model_name` for drop-in CLI compatibility): + +- **single:** `{query, strategy="single", tier="single", model_name, predicted_llm, difficulty, confidence}` +- **fusion:** `{query, strategy="fusion", tier="budget_fusion"|"fusion", panel[], judge, model_name, predicted_llm, difficulty, confidence, projected_cost}` + +When the cost guard fires, a fusion decision is **downgraded** to single and the +result carries `downgraded_from`, `projected_cost`, and `cost_ceiling`. + +## Configuration + +All keys live under `hparam:` in `config.yaml` unless noted. + +| Key | Default | Purpose | +|-----|---------|---------| +| `threshold` | `0.5` | Difficulty cutoff to escalate to the full Quality `fusion` tier. 
| +| `budget_threshold` | `0.3` | Lower boundary of the middle `budget_fusion` tier. `null` (or `>= threshold`) disables it. | +| `k` | `3` | Panel size — maps to the tool's `analysis_models`. | +| `judge` | `null` | Judge model slug — maps to the tool's `model`. `null` = use the outer model. | +| `panel_preset` | `Quality` | Fallback preset (`Quality` / `Budget`) when capability data is unavailable for a query. | +| `cost_ceiling` | `null` | Hard per-query **dollar** cap on the projected `Σ(panel)+judge` cost. `null` = off. See the cost-unit note. | +| `est_completion_tokens` | `512` | Per-completion output-token estimate feeding the dollar cost projection. | +| `provider` | `OpenRouter` | Informational; drives credential resolution. | +| `base_url` | `https://openrouter.ai/api/v1` | OpenRouter endpoint hosting the beta server tool. Overrides the top-level `api_endpoint`. | +| `log_sink_path` | `null` | JSONL sink for fusion-call logging. `null` = `fusion_log` default (`~/.llmrouter/openclaw_memory.jsonl`). | + +Top-level `data_path` / `metric` keys mirror the other custom routers +(`randomrouter`, `thresholdrouter`); see `config.yaml` for the loaded candidate +and routing-data paths. + +### Cost-unit note (important) + +`cost_ceiling` is compared against `project_cost`, which estimates the **per-query +dollar cost** of the panel + judge. For each member, +`(input_price · prompt_tokens + output_price · completion_tokens) / 1e6`, where +`input_price` / `output_price` are the per-million-token prices from `llm_data`, +`prompt_tokens ≈ len(query) // 4`, and `completion_tokens = est_completion_tokens` +(default `512`). Set `cost_ceiling` in **dollars per query** (e.g. `0.05` ≈ five +cents per query). + +## Panel selection + +Panels are chosen by `CapabilityScorer`, which scores candidates per **query +category** (code / math / reasoning / general) from the LLMRouter routing-data +tables, lightly cost-penalized. 
When no usable capability data exists for a +query's category, selection falls back to a preset panel resolved by tier: +`budget_fusion` -> `Budget`, anything else -> the configured `panel_preset` +(`Quality` by default). The tier->preset mapping (`gate.resolve_preset`) is the +single source of truth shared with the eval harness. + +## OpenRouter `openrouter:fusion` — BETA caveat + +The fusion path depends on OpenRouter's `openrouter:fusion` **server tool, which +is BETA**: its request/response shape may change without notice. To contain that +risk, **every OpenRouter HTTP specific lives in `executor.py` and nowhere else** +— request body construction, the `openrouter:fusion` tool type, key resolution, +transport, and payload parsing. An upstream beta change should touch that one +file only. The executor also tolerates judge failure (status `ok` with +`analysis` omitted): it synthesizes the answer from the panel responses rather +than crashing. + +OpenRouter is the **one optional provider**. There is no local fan-out fallback +(deferred to a follow-up); without a key, only `--route-only` is exercisable. + +## Logging + +Every `fuse()` call is appended (best-effort, append-only) to the JSONL sink via +`fusion_log.log_fusion`. The sink is **secret-scrubbed**: API keys, auth +headers, cookies, and the untouched provider payload are never written; only an +enumerated set of fields (query, panel, judge, normalized responses, analysis, +token/cost) is emitted. These rows are the FusionFactory-style training signal +consumed by the offline retrain step. + +## Offline evaluation (`--mock`, zero spend) + +The three-arm harness compares `always_route`, `always_fuse`, and `fusion_gate` +over a bundled hard-query slice (GSM8K / MATH / GPQA / MBPP). It is **offline by +default** — a deterministic stub executor reads canned answers from fixtures; no +network call is made and nothing is spent. 
+ +```bash +# Run the offline harness (mock is the default) +python -m custom_routers.fusion_gate.eval.eval_harness --mock \ + --out custom_routers/fusion_gate/eval/out + +# Include the mock retrain (M3 before/after) delta in results.md +python -m custom_routers.fusion_gate.eval.eval_harness --mock --with-retrain \ + --out custom_routers/fusion_gate/eval/out +``` + +Tunable flags: `--threshold` (0.5), `--budget-threshold` (0.3), `--k` (2 in the +harness — kept cost-bounded for the M2 target; the plugin config uses `k=3`), +`--judge`, `--panel-preset`, `--dataset`, `--llm`, `--routing`, `--out`. +Outputs: `<out>/results.csv` and `<out>/results.md` (the `--out` dir defaults to +`eval/out/`, which is **gitignored** — runtime output, not source). The committed, +intentional report lives at [`eval/RESULTS.md`](eval/RESULTS.md), which also documents +the full-dataset vs hard-slice definitions used by the metrics. + +`--live` is intentionally **not** wired into this harness, so a stray run cannot +spend; passing it errors out with a pointer to the keyed live-run path. + +Run the unit tests (torch-free, fully offline, HTTP mocked): + +```bash +pytest custom_routers/fusion_gate/tests/ +``` + +## Live run (keyed, real spend) + +The committed eval numbers are from MOCK fixtures. To produce real M1–M4 numbers +you must run keyed against real models: + +```bash +# Provide an OpenRouter key (never commit it): +export OPENROUTER_API_KEY=sk-... # or: export API_KEYS='{"OpenRouter": "sk-..."}' + +# Then build the real FusionGateRouter from config.yaml and route+fuse a real +# benchmark slice; the executor makes the openrouter:fusion calls. The offline +# eval harness does NOT make live calls by design — see eval/RESULTS.md. +``` + +Keys are resolved (in order) from an explicit `api_keys={"OpenRouter": "..."}` +dict, `OPENROUTER_API_KEY`, or an `API_KEYS` JSON env var. Keys are never logged. + +## Files + +- `router.py` — `FusionGateRouter` entry point (MetaRouter contract). 
def __getattr__(name: str) -> Any:
    """Resolve ``FusionGateRouter`` on demand (PEP 562 module ``__getattr__``).

    The import of ``.router`` is deferred to the first attribute access so that
    merely resolving this package does not load ``router.py`` and its
    dependencies.

    Args:
        name: Attribute requested on the package.

    Returns:
        The ``FusionGateRouter`` class when ``name`` is ``"FusionGateRouter"``.

    Raises:
        AttributeError: For any other attribute name.
    """
    # Guard clause first: every name except the single lazy export is unknown.
    if name != "FusionGateRouter":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from .router import FusionGateRouter as router_cls

    return router_cls
# Closed set of category labels produced by ``CapabilityScorer.classify_query``
# and ``CapabilityScorer._task_to_category``; "general" is the catch-all.
QueryCategory = Literal["code", "math", "reasoning", "general"]

# --- query-type detection (deterministic, documented) ------------------------
# Mirrors the gate's code/math markers but resolves a single *category* label so
# panel selection can be category-specific. Order matters: the first matching
# category wins, with "general" as the catch-all.
#
# Keywords are tested with ``in`` against the lowercased query and symbols with
# ``in`` against the raw query (see ``CapabilityScorer._matches``), i.e. plain
# substring containment with no word boundaries.
# NOTE(review): because of that, "code" also matches "decode" and "plan" also
# matches "planet"; confirm this looseness is intended.
_CODE_KEYWORDS = (
    "code",
    "function",
    "compile",
    "debug",
    "regex",
    "program",
    "python",
    "javascript",
    "bug",
)
# NOTE(review): "{", "}" and ";" also occur in LaTeX/math text, and code is
# checked before math, so such queries are labelled "code" — verify against the
# gate's markers that this precedence is the desired behaviour.
_CODE_SYMBOLS = ("```", "def ", "class ", "{", "}", ";", "=>", "->")
_MATH_KEYWORDS = (
    "integral",
    "derivative",
    "theorem",
    "proof",
    "equation",
    "matrix",
    "algebra",
    "calculus",
    "probability",
    "geometry",
)
# The last entry is a single backslash character.
_MATH_SYMBOLS = ("∫", "∑", "√", "^", "\\")
# Reasoning has keywords only; ``classify_query`` checks no symbols for it.
_REASONING_KEYWORDS = (
    "algorithm",
    "complexity",
    "reason",
    "logic",
    "deduce",
    "puzzle",
    "explain why",
    "step by step",
    "strategy",
    "plan",
)

# Mapping from routing-data ``task_name`` substrings to a query category. The
# example routing data uses task names like ``agentverse-logicgrid``; this lets
# the per-category aggregation align with the query-type detector above.
# Entries are scanned in order against the lowercased task name and the first
# substring hit wins, so earlier rows take precedence over later ones.
_TASK_CATEGORY_PATTERNS: tuple[tuple[str, QueryCategory], ...] = (
    ("logic", "reasoning"),
    ("reason", "reasoning"),
    ("grid", "reasoning"),
    ("puzzle", "reasoning"),
    ("math", "math"),
    ("gsm", "math"),
    ("arithmetic", "math"),
    ("algebra", "math"),
    ("code", "code"),
    ("humaneval", "code"),
    ("mbpp", "code"),
    ("program", "code"),
)
PRESET_QUALITY = "Quality"
PRESET_BUDGET = "Budget"


class CapabilityScorer:
    """Score candidate models per query and pick a top-k fusion panel.

    Two signals feed a model's score: the mean routing-data performance for the
    query's category (primary) and a static prior derived from the candidate's
    ``size`` / ``feature`` text, lightly cost-penalized (secondary).

    Args:
        llm_data: name -> candidate-metadata mapping (from default_llm.json),
            carrying ``feature`` text and ``input_price`` / ``output_price``.
            ``None`` is treated as an empty candidate set.
        routing_data: optional per-model performance source — a pandas DataFrame
            or an iterable of row dicts with ``task_name`` / ``model_name`` /
            ``performance`` keys. When ``None`` or empty, capability scoring
            falls back to the static prior derived from ``llm_data``.
    """

    def __init__(
        self,
        llm_data: dict[str, Any],
        routing_data: Any = None,
    ):
        # A missing candidate table behaves like an empty one instead of raising
        # on ``.keys()``; every public method already handles "no candidates".
        self.llm_data = {} if llm_data is None else llm_data
        self.llm_names = list(self.llm_data.keys())
        # category -> {model_name -> mean performance in roughly [0, 1]}
        self._perf_by_category: dict[QueryCategory, dict[str, float]] = (
            self._aggregate_performance(routing_data)
        )

    # ----------------------------------------------------------- public API

    def select_panel(self, query: str, k: int) -> list[str] | None:
        """Return the capability-scored top-k panel for ``query``.

        The query is classified into a category (code/math/reasoning/general);
        models are scored for that category and the top ``k`` by score are
        returned. Returns ``None`` when no usable capability data exists for the
        category, signalling the caller to fall back to ``panel_preset``.

        Args:
            query: Raw query text.
            k: Panel size (maps to the fusion tool's ``analysis_models`` length).

        Returns:
            A list of up to ``k`` candidate model names, or ``None`` to trigger
            the preset fallback.
        """
        if k <= 0 or not self.llm_names:
            return None

        category = self.classify_query(query)
        scores = self._score_models(category)
        if scores is None:
            return None

        # Ties are broken by name so the ordering is fully deterministic.
        ranked = sorted(
            self.llm_names,
            key=lambda name: (scores.get(name, 0.0), name),
            reverse=True,
        )
        return ranked[:k]

    def preset_panel(self, preset: str, k: int) -> list[str]:
        """Resolve a named preset (Quality / Budget) to a top-k panel.

        Quality => the ``k`` highest-priced candidates (price is used as the
        capability proxy, descending). Budget => the ``k`` cheapest candidates.
        Any unrecognized preset is treated as Quality. Used as the fallback when
        capability data is unavailable.

        Args:
            preset: Preset label, compared case-insensitively.
            k: Panel size.

        Returns:
            Up to ``k`` candidate names; empty when ``k <= 0`` or there are no
            candidates.
        """
        if k <= 0 or not self.llm_names:
            return []

        budget = str(preset).lower() == PRESET_BUDGET.lower()
        # One sort serves both presets: ascending (price, name) for Budget,
        # the same key descending for Quality and anything unrecognized.
        ranked = sorted(
            self.llm_names,
            key=lambda name: (self._price(name), name),
            reverse=not budget,
        )
        return ranked[:k]

    def classify_query(self, query: str) -> QueryCategory:
        """Classify a query into a coarse capability category.

        Deterministic precedence: code, then math, then reasoning, else general.
        Kept pure (text in, label out) for unit testing.
        """
        if not query:
            return "general"
        lowered = query.lower()

        if self._matches(lowered, query, _CODE_KEYWORDS, _CODE_SYMBOLS):
            return "code"
        if self._matches(lowered, query, _MATH_KEYWORDS, _MATH_SYMBOLS):
            return "math"
        if any(keyword in lowered for keyword in _REASONING_KEYWORDS):
            return "reasoning"
        return "general"

    # ----------------------------------------------------------- scoring

    def _score_models(self, category: QueryCategory) -> dict[str, float] | None:
        """Build a per-model score map for a category, or ``None`` if unusable.

        Combines two signals:
          1. Empirical per-category performance from the routing data (primary).
          2. A static prior from ``llm_data`` (feature/size wording, lightly
             cost-penalized) so models absent from the routing table for this
             category still rank relative to one another.

        Returns ``None`` when *neither* signal yields any differentiation: no
        candidate has routing data for the category AND the prior assigns every
        candidate the same value (or is empty). That is the fallback trigger
        for ``select_panel``.
        """
        empirical = self._perf_by_category.get(category, {})
        prior = self._static_prior()

        # The prior always has one entry per candidate, so "prior is non-empty"
        # says nothing about whether it can rank. If no candidate is covered
        # empirically and the prior is flat, ranking would degenerate to name
        # order — report "unusable" so the caller uses the preset instead.
        has_empirical = any(name in empirical for name in self.llm_names)
        if not has_empirical and len(set(prior.values())) < 2:
            return None

        scores: dict[str, float] = {}
        for name in self.llm_names:
            emp = empirical.get(name)
            pri = prior.get(name, 0.0)
            if emp is not None:
                # Empirical performance dominates; the prior breaks ties and
                # ranks models the routing table did not cover for this category.
                scores[name] = 0.8 * emp + 0.2 * pri
            else:
                scores[name] = pri
        return scores

    def _static_prior(self) -> dict[str, float]:
        """Capability prior in [0, 1] from llm_data feature text and price.

        Heuristic and deterministic: the size/feature capability estimate minus
        a small penalty proportional to the candidate's price relative to the
        most expensive candidate, so two models with similar capability favor
        the cheaper one. Returns an empty map when ``llm_data`` is empty.
        """
        if not self.llm_names:
            return {}

        price_by_name = {name: self._price(name) for name in self.llm_names}
        max_price = max(price_by_name.values())

        prior: dict[str, float] = {}
        for name in self.llm_names:
            capability = self._feature_capability(self._info(name))
            # Normalize price to [0, 1]; subtract a small cost penalty.
            price = price_by_name[name]
            norm_price = (price / max_price) if max_price > 0 else 0.0
            prior[name] = self._clamp(capability - 0.1 * norm_price)
        return prior

    def _feature_capability(self, info: dict[str, Any]) -> float:
        """Estimate capability in [0, 1] from a candidate's size/feature text.

        Uses the model ``size`` (parameter count) when parseable, else falls
        back to capability-suggestive wording in the ``feature`` blurb. Both are
        normalized into [0, 1]; deterministic and offline.
        """
        size_score = self._size_score(info.get("size"))
        if size_score is not None:
            return size_score

        feature = str(info.get("feature", "")).lower()
        strong_markers = (
            "powerful",
            "high-accuracy",
            "exceptional",
            "advanced",
            "complex",
            "large-scale",
        )
        hits = sum(1 for marker in strong_markers if marker in feature)
        # Three or more markers saturate the score at 1.0.
        return self._clamp(hits / 3.0)

    @staticmethod
    def _size_score(size: Any) -> float | None:
        """Parse a parameter-count string (e.g. ``"49B"``) into a [0, 1] score.

        Normalized by a 200B saturation point so the example candidate set
        (7B..141B) spreads across the range. A trailing ``m``/``M`` is read as
        millions; ``b``/``B`` or no unit as billions. Returns ``None`` when
        unparseable.
        """
        if size is None:
            return None
        match = re.match(r"\s*([\d.]+)\s*([bBmM]?)", str(size))
        if not match:
            return None
        try:
            value = float(match.group(1))
        except ValueError:
            # e.g. "1.2.3" or "." satisfy the character class but are not numbers.
            return None
        unit = match.group(2).lower()
        billions = value / 1000.0 if unit == "m" else value
        return CapabilityScorer._clamp(billions / 200.0)

    # ----------------------------------------------------------- aggregation

    def _aggregate_performance(
        self, routing_data: Any
    ) -> dict[QueryCategory, dict[str, float]]:
        """Aggregate mean performance per (category, model) from routing data.

        Accepts a pandas DataFrame or an iterable of row dicts. Rows missing the
        required keys, or whose ``performance`` is not a finite number (``None``,
        text, NaN, inf), are skipped so one bad cell cannot turn a model's mean
        into NaN. ``task_name`` is bucketed into a query category;
        ``performance`` values are averaged per (category, model). Returns an
        empty mapping when no usable rows are present.
        """
        rows = self._iter_rows(routing_data)
        # category -> model -> [running_sum, count]
        accum: dict[QueryCategory, dict[str, list[float]]] = {}

        for row in rows:
            model = row.get("model_name")
            perf_value = self._finite_float(row.get("performance"))
            if model is None or perf_value is None:
                continue
            category = self._task_to_category(row.get("task_name"))
            bucket = accum.setdefault(category, {})
            entry = bucket.setdefault(str(model), [0.0, 0.0])
            entry[0] += perf_value
            entry[1] += 1.0

        result: dict[QueryCategory, dict[str, float]] = {}
        for category, models in accum.items():
            result[category] = {
                name: (total / count) if count else 0.0
                for name, (total, count) in models.items()
            }
        return result

    @staticmethod
    def _iter_rows(routing_data: Any) -> Iterable[dict[str, Any]]:
        """Yield row dicts from a DataFrame or an iterable of dicts.

        DataFrames are detected by duck-typing ``to_dict`` (pandas) so this
        module never imports pandas. Anything else is treated as an iterable of
        mapping-like rows; non-mappings are ignored.
        """
        if routing_data is None:
            return []
        # pandas DataFrame: convert to list-of-dicts without importing pandas.
        if hasattr(routing_data, "to_dict"):
            try:
                return routing_data.to_dict(orient="records")
            except TypeError:
                # Has ``to_dict`` but does not accept ``orient`` — not row data.
                return []
        if isinstance(routing_data, dict):
            # A bare mapping is not a sequence of rows.
            return []
        try:
            return [row for row in routing_data if isinstance(row, dict)]
        except TypeError:
            # Not iterable at all.
            return []

    @staticmethod
    def _task_to_category(task_name: Any) -> QueryCategory:
        """Bucket a routing-data ``task_name`` into a query category."""
        if not task_name:
            return "general"
        lowered = str(task_name).lower()
        for pattern, category in _TASK_CATEGORY_PATTERNS:
            if pattern in lowered:
                return category
        return "general"

    # ----------------------------------------------------------- utilities

    def _info(self, name: str) -> dict[str, Any]:
        """Return the metadata mapping for ``name``, or ``{}`` if absent/invalid."""
        info = self.llm_data.get(name)
        return info if isinstance(info, dict) else {}

    def _price(self, name: str) -> float:
        """Per-model unit price (input + output) from llm_data.

        A price that is missing, ``None``, non-numeric, or non-finite counts as
        ``0.0`` rather than raising, so one malformed candidate entry cannot
        break panel selection for the whole set.
        """
        info = self._info(name)
        input_price = self._finite_float(info.get("input_price"))
        output_price = self._finite_float(info.get("output_price"))
        return (input_price or 0.0) + (output_price or 0.0)

    @staticmethod
    def _finite_float(value: Any) -> float | None:
        """Convert ``value`` to a finite float, or ``None`` when that fails."""
        # Imported locally so the block does not depend on a module-level import.
        import math

        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return number if math.isfinite(number) else None

    @staticmethod
    def _matches(
        lowered: str, raw: str, keywords: tuple[str, ...], symbols: tuple[str, ...]
    ) -> bool:
        """True when any keyword (lowercased) or raw symbol is present."""
        if any(keyword in lowered for keyword in keywords):
            return True
        return any(symbol in raw for symbol in symbols)

    @staticmethod
    def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
        """Clamp ``value`` into [low, high]."""
        return max(low, min(high, value))
+ # OpenRouter models should set "service": "OpenRouter" so API_KEYS resolves the key. + llm_data: 'data/example_data/llm_candidates/default_llm.json' + + # Per-model routing performance — the capability source for UMB-123 panel + # selection. Loaded by MetaRouter's DataLoader (mirrors randomrouter/config.yaml). + query_data_test: 'data/example_data/query_data/default_query_test.jsonl' + routing_data_test: 'data/example_data/routing_data/default_routing_test_data.jsonl' + +# Metric weights (optional, for evaluation; mirrors randomrouter/config.yaml). +metric: + weights: + performance: 1 + cost: 0 + llm_judge: 0 + +hparam: + # --- gate (UMB-119) --- + threshold: 0.5 # difficulty cutoff to escalate -> full Quality fusion + + # --- three-tier dial (UMB-124) --- + # Lower boundary for the middle tier. difficulty in [budget_threshold, threshold) + # routes to a cheap Budget fusion panel; >= threshold routes to the full Quality + # panel; < budget_threshold routes single. Set to null to disable the mid tier. + budget_threshold: 0.3 + + # --- panel selection (UMB-123) --- + k: 3 # panel size -> maps to openrouter:fusion `analysis_models` + judge: null # judge slug -> maps to tool `model`; null = outer model + panel_preset: 'Quality' # fallback when capability data unavailable: Quality | Budget + + # --- cost guard (UMB-121) --- + # Hard per-query DOLLAR cap on the projected Σ(panel)+judge cost; null = off. + # e.g. 0.05 ≈ five cents per query. Projection: + # (input_price*prompt_tokens + output_price*completion_tokens)/1e6 per member, + # with prompt_tokens ≈ len(query)//4 and completion_tokens = est_completion_tokens. + cost_ceiling: null + est_completion_tokens: 512 # per-completion output-token estimate for the projection + + # --- provider / endpoint (UMB-121) --- + # base_url is the OpenRouter endpoint hosting the beta server tool; provider is + # informational (key resolution). base_url overrides the top-level api_endpoint. 
+ provider: 'OpenRouter' + base_url: 'https://openrouter.ai/api/v1' + + # Optional JSONL sink for fusion-call logging (UMB-125). null = fusion_log default. + log_sink_path: null + +# OpenRouter endpoint (server tools live here). Per-model endpoints in the +# candidate JSON override this. Key supplied via API_KEYS '{"OpenRouter": "..."}'. +api_endpoint: 'https://openrouter.ai/api/v1' diff --git a/custom_routers/fusion_gate/eval/RESULTS.md b/custom_routers/fusion_gate/eval/RESULTS.md new file mode 100644 index 0000000..dac3022 --- /dev/null +++ b/custom_routers/fusion_gate/eval/RESULTS.md @@ -0,0 +1,107 @@ +# FusionGateRouter — eval harness results + +> **These numbers are from MOCK fixtures (deterministic stub executor, zero spend).** +> They validate the harness wiring and metric math, NOT real model quality. +> **Real M1–M4 numbers require a keyed live run** (`OPENROUTER_API_KEY` / `API_KEYS` +> set) against a real benchmark slice — see the *Live run* section below. + +This file is the **committed, intentional** eval report. The harness also writes a +fresh `results.csv` / `results.md` into the gitignored `eval/out/` directory on every +run; those are runtime output and are never tracked. Regenerate the numbers below +with: + +```bash +python -m custom_routers.fusion_gate.eval.eval_harness --mock --with-retrain \ + --out custom_routers/fusion_gate/eval/out +``` + +- Source: MOCK fixtures (zero spend) +- Dataset: `eval/fixtures/hard_slice.jsonl` (16 held-out queries; GSM8K / MATH / GPQA / MBPP) + +## Slice definitions + +The dataset mixes EASY and HARD queries (6 easy, 10 hard). Two distinct slices are +used so the metrics are comparable across arms: + +- **Full dataset (16 queries)** — drives Quality, Blended cost, and Escalation `p`. + Every arm is scored over all 16 records. +- **Hard slice (10 queries)** — the fixed, arm-independent set used for the **M3 + gate-precision** metric. A record is *hard* when its `id` carries the `-hard-` + marker (e.g. 
`gsm8k-hard-01`); an explicit `difficulty: "hard"` field overrides + the id heuristic when present. See `eval_harness.is_hard_record`. + +**Why the hard slice matters for M3 (apples-to-apples):** M3 asks "among escalated +queries, how often does fusion beat the best single answer?" The `always_fuse` arm +escalates *every* query (easy + hard) while the `fusion_gate` arm escalates *only the +hard ones*. Scoring M3 over each arm's own escalation set would give the two arms +different denominators (16 vs 10) and the numbers would not be comparable. M3 is +therefore computed over the **same hard slice for every arm**. `always_route` makes +no escalation decision, so its M3 is **N/A** (undefined). + +## Per-arm metrics + +Quality / Blended cost / Escalation `p` are over the full 16-query dataset; M3 is over +the 10-query hard slice. + +| Arm | n | Quality | Blended cost ($/query) | Escalation p | Gate-precision (M3, hard slice) | +|-----|---|---------|------------------------|--------------|---------------------------------| +| always_route | 16 | 0.3750 | 0.000650 | 0.0000 | n/a | +| always_fuse | 16 | 1.0000 | 0.001137 | 1.0000 | 1.0000 | +| fusion_gate | 16 | 1.0000 | 0.000767 | 0.6250 | 1.0000 | + +Blended cost is an estimated **per-query dollar** amount: for each panel member + judge, +`(input_price · prompt_tokens + output_price · completion_tokens) / 1e6`, with +`input_price` / `output_price` the per-million-token prices from `llm_data`, +`prompt_tokens ≈ len(query) // 4`, and `completion_tokens = est_completion_tokens` +(default 512). This is the same projection the `cost_ceiling` guard compares against, +so `cost_ceiling` is set in dollars per query. + +## Metric targets + +- **M1** — gate quality ≥ 95% of always-fuse quality (hard slice): gate quality 1.0000 + vs target 0.9500 (95% of always-fuse 1.0000); ratio 1.0000 → **PASS**. 
+- **M2** — blended cost ≤ 1.6× always-route: gate cost 0.000767 vs target 0.001039 + (1.6× always-route 0.000650); ratio 1.1802 → **PASS**. +- **M3** — gate-precision over the hard slice (escalated answers that beat best single): + fusion_gate 1.0000 (10/10), always_fuse 1.0000 (10/10) → **measured** (same slice for + both arms; always_route N/A). + +## Retrain: gate-precision before vs after + +- Source: MOCK fixtures (synthesized fusion log, zero spend) +- Replayed 16 fusion-log entries → 32 graded training rows. +- Routing table augmented: 28 → 60 rows. +- Gate threshold refit: 0.400 → 0.520. +- Gate budget_threshold refit: 0.100 → 0.180 (raised so wasted low-difficulty + escalations route single). + +| Metric | Before | After | Delta | +|--------|--------|-------|-------| +| M3 gate-precision (hard slice) | 1.0000 | 1.0000 | +0.0000 | +| Escalated (hard slice) | 10 | 10 | +0 | +| Escalated-and-improved | 10 | 10 | +0 | + +> With M3 scored over the fixed hard slice, the mock retrain holds gate-precision at +> 1.0000 (it no longer benefits from the prior easy/hard denominator mismatch). The +> real M3 delta (M4) requires a keyed live run replaying a real fusion-log sink. + +## Live run (keyed, real spend) + +The committed numbers above are from MOCK fixtures and a deterministic stub executor — +**zero spend, no network**. To produce real M1–M4 numbers you must run keyed against +real models: + +```bash +# 1. Provide an OpenRouter key (never commit it): +export OPENROUTER_API_KEY=sk-... # or: export API_KEYS='{"OpenRouter": "sk-..."}' + +# 2. Build the real router from the plugin config and route+fuse a +# real benchmark slice (GSM8K/MATH/GPQA/MBPP), scoring answers with +# llmrouter/data/api_calling_evaluation.eval_perf. The real +# FusionGateRouter + FusionExecutor make the openrouter:fusion calls; +# all OpenRouter HTTP specifics stay inside executor.py. +# (This offline harness does NOT make live calls by design.) 
+``` + +M4 (the offline log→retrain quality delta) is produced by `retrain.py`; its mock delta +is reported above when `--with-retrain` is passed. diff --git a/custom_routers/fusion_gate/eval/__init__.py b/custom_routers/fusion_gate/eval/__init__.py new file mode 100644 index 0000000..e963f34 --- /dev/null +++ b/custom_routers/fusion_gate/eval/__init__.py @@ -0,0 +1,13 @@ +"""fusion_gate.eval — offline eval + retrain harness for FusionGateRouter. + +This package contains the route-vs-fuse evaluation harness (UMB-122/124) and the +scripted retrain loop (UMB-126). Both run fully offline in ``--mock`` mode against +the bundled fixtures under ``fixtures/`` and spend nothing; a keyed live-run path +is documented in ``results.md`` and in each module's docstring. + +Design constraint: nothing here imports torch or pandas. The harness composes the +torch-free seams of the plugin directly — :class:`RouteGate`, :class:`CapabilityScorer`, +:class:`FusionExecutor` / a deterministic mock stub, and ``fusion_log`` — mirroring +what :class:`FusionGateRouter` wires internally, so the harness is importable and +testable with only the standard library. +""" diff --git a/custom_routers/fusion_gate/eval/eval_harness.py b/custom_routers/fusion_gate/eval/eval_harness.py new file mode 100644 index 0000000..c238668 --- /dev/null +++ b/custom_routers/fusion_gate/eval/eval_harness.py @@ -0,0 +1,813 @@ +"""eval_harness — three-arm route-vs-fuse evaluation (UMB-122, UMB-124). + +Compares three strategies over a held-out HARD-query slice drawn from the +LLMRouter benchmark families (GSM8K / MATH / GPQA / MBPP): + + * ``always_route`` — baseline: every query takes the cheap single-model path + (the gate's cheapest-capable single pick). One model call. + * ``always_fuse`` — every query takes the full Quality fusion panel. + * ``fusion_gate`` — the FusionGateRouter decision: gate each query between the + single path and a fusion tier, fusing only the hard ones. 
+ +Per arm it captures: + + * quality score — mean correctness of the chosen answer vs ground truth. + * blended cost — mean projected $ per query (single = one model; fusion = + Σ(panel)+judge, from the executor's ``project_cost``). + * escalation rate ``p`` — fraction of queries the arm sent to a fusion tier. + * gate-precision (M3, UMB-124) — among ESCALATED hard-slice queries, the fraction whose + synthesized fusion answer beats the best single-model answer. + +Metric targets reported against the baselines: + + * M1: fusion-gate quality >= 95% of always-fuse quality on the hard slice. + * M2: fusion-gate blended cost <= 1.6x always-route blended cost. + * M3: gate-precision (escalated-and-improved) — reported per UMB-124. + +OFFLINE / ZERO-SPEND (``--mock``, the default): a deterministic stub executor +(:class:`MockFusionExecutor`) reads canned per-model answers from the bundled +fixture (``fixtures/hard_slice.jsonl``); NO network call is made and nothing is +spent. The harness composes the plugin's torch-free seams (``RouteGate``, +``CapabilityScorer``, ``FusionExecutor`` projection, ``fusion_log``) directly, +mirroring what ``FusionGateRouter`` wires internally — it never imports torch. + +LIVE RUN (keyed, real spend — documented, not the default): construct the real +``FusionGateRouter`` from ``custom_routers/fusion_gate/config.yaml`` and call its +``route_single`` / ``fuse`` with ``OPENROUTER_API_KEY`` (or ``API_KEYS``) set, +over a real benchmark slice. See ``results.md`` and ``--help``. The live path is +intentionally NOT wired into this offline harness so a stray run cannot spend. + +Usage (offline):: + + python -m custom_routers.fusion_gate.eval.eval_harness --mock \ + --out custom_routers/fusion_gate/eval/out + +Outputs (written under the ``--out`` directory): ``results.csv`` (per-arm rows) and ``results.md`` (report). 
+""" + +from __future__ import annotations + +import argparse +import csv +import importlib.util +import json +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Callable, Iterable + +# --- offline, torch-free imports ------------------------------------------- +# Load the plugin's torch-free modules directly by file path so importing this +# harness never triggers the package __init__ (which imports torch via router.py). +_PLUGIN_DIR = Path(__file__).resolve().parents[1] +_FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures" + + +def _load_module(name: str, filename: str): + """Load a sibling plugin module by file path (no package import side effects).""" + path = _PLUGIN_DIR / filename + spec = importlib.util.spec_from_file_location(name, str(path)) + if spec is None or spec.loader is None: # pragma: no cover - defensive + raise ImportError(f"cannot load {filename}") + module = importlib.util.module_from_spec(spec) + # Register before exec so dataclass field types in the module resolve. 
def load_jsonl(path: str | Path) -> list[dict[str, Any]]:
    """Parse a UTF-8 JSONL file into a list, one decoded object per line.

    Lines that are empty after stripping surrounding whitespace are skipped;
    every other line is decoded with ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        # Consumed inside the ``with`` so the file is still open while read.
        return [json.loads(text) for text in stripped if text]


def load_llm_candidates(path: str | Path) -> dict[str, Any]:
    """Load the candidate-metadata JSON document (default_llm.json shape)."""
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload
def project_cost(
    self,
    panel: list[str],
    judge: str | None,
    query: str | None = None,
    prompt_tokens: int | None = None,
) -> float:
    """Forward the cost projection to the wrapped projector.

    The mock does no cost math of its own: ``panel`` and ``judge`` are passed
    positionally and ``query`` / ``prompt_tokens`` by keyword to
    ``self._projector.project_cost``, whose result is returned unchanged.
    """
    estimate_kwargs = {"query": query, "prompt_tokens": prompt_tokens}
    return self._projector.project_cost(panel, judge, **estimate_kwargs)
def normalize_answer(answer: Any) -> str:
    """Normalize an answer for exact comparison (offline-safe, deterministic).

    The value is stringified, stripped and lowercased, then any leading or
    trailing spaces, periods, dollar signs, tabs and newlines are removed.
    Kept intentionally simple: it performs no numeric or semantic
    canonicalization (``"5.0"`` and ``"5"`` stay different).
    """
    lowered = str(answer).strip().lower()
    return lowered.strip(" .$\t\n")


def score_answer(prediction: Any, ground_truth: Any) -> float:
    """Binary correctness in {0.0, 1.0} via normalized exact match.

    A missing side (``None`` prediction or ``None`` ground truth) always
    scores ``0.0``: an unlabeled record cannot be counted as correct.
    Without this guard ``None`` is stringified to ``"none"`` and would match
    a literal answer of ``"None"``.
    """
    if prediction is None or ground_truth is None:
        return 0.0
    matched = normalize_answer(prediction) == normalize_answer(ground_truth)
    return 1.0 if matched else 0.0


def is_hard_record(record: dict[str, Any]) -> bool:
    """Return True if a fixture record belongs to the HARD slice.

    An explicit ``difficulty`` field wins when present (hard only when it
    equals ``"hard"``, case-insensitively); otherwise the record is hard when
    its ``id`` contains the ``-hard-`` marker (e.g. ``gsm8k-hard-01``).
    """
    explicit = record.get("difficulty")
    if explicit is not None:
        return str(explicit).lower() == "hard"
    return "-hard-" in str(record.get("id", ""))
@dataclass
class ArmResult:
    """Aggregate metrics for one evaluation arm.

    One instance is produced per arm run. The harness fills ``n`` with the
    dataset size at construction, appends one row dict per record to
    ``per_query`` while iterating, and then writes the aggregate fields.
    """

    # Arm identifier, e.g. "always_route", "always_fuse" or "fusion_gate".
    arm: str
    # Number of dataset records the arm was run over.
    n: int = 0
    quality: float = 0.0  # mean correctness in [0, 1]
    blended_cost: float = 0.0  # mean projected $ per query
    escalation_p: float = 0.0  # fraction routed to a fusion tier
    gate_precision: float | None = None  # M3 (UMB-124): None when undefined
    # Numerator/denominator behind ``gate_precision``: escalated rows counted
    # for M3, and how many of those had a correct fusion answer where the best
    # single answer was wrong.
    n_escalated: int = 0
    n_escalated_improved: int = 0
    # Per-record detail rows (id, arm, escalated flag, answer, quality, cost, ...).
    per_query: list[dict[str, Any]] = field(default_factory=list)
def __init__(
    self,
    dataset: list[dict[str, Any]],
    llm_data: dict[str, Any],
    executor: Any,
    routing_data: list[dict[str, Any]] | None = None,
    threshold: float = 0.5,
    budget_threshold: float | None = 0.3,
    k: int = 3,
    judge: str | None = None,
    panel_preset: str = "Quality",
):
    """Store the evaluation inputs and build the gate and capability scorer.

    ``threshold`` / ``budget_threshold`` configure the ``RouteGate``;
    ``routing_data`` feeds the ``CapabilityScorer``; ``k``, ``judge`` and
    ``panel_preset`` are kept for panel selection and fusion calls.
    """
    self.dataset, self.llm_data, self.executor = dataset, llm_data, executor
    self.k, self.judge, self.panel_preset = k, judge, panel_preset
    gate_options = {"threshold": threshold, "budget_threshold": budget_threshold}
    self.gate = RouteGate(llm_data=llm_data, **gate_options)
    self.capability = CapabilityScorer(llm_data=llm_data, routing_data=routing_data)
+ preset = resolve_preset(tier, self.panel_preset) + return self.capability.preset_panel(preset, self.k) + + def _quality_preset_panel(self) -> Callable[[str], list[str]]: + """Panel selector for the always-fuse arm (always the Quality preset).""" + + def select(query: str) -> list[str]: + panel = self.capability.select_panel(query, self.k) + if panel: + return panel + return self.capability.preset_panel("Quality", self.k) + + return select + + # ----------------------------------------------------------- arms + + def _best_single_model(self, query: str) -> str: + """The capability-best single model for a query (fair single-router pick). + + Mirrors what a good classic single-model router would choose: the + top-1 capability-scored candidate for the query category, falling back to + the Quality preset head, then the cheapest model. Used by the + ``always_route`` baseline so it is a CAPABLE single-router, not a + cheapest-only strawman. + """ + top = self.capability.select_panel(query, 1) + if top: + return top[0] + preset = self.capability.preset_panel("Quality", 1) + if preset: + return preset[0] + return self.gate.cheapest_model() + + def run_always_route(self) -> ArmResult: + """Baseline arm: every query → its capability-best single model (one call).""" + res = ArmResult(arm="always_route", n=len(self.dataset)) + total_q = 0.0 + total_cost = 0.0 + for record in self.dataset: + model = self._best_single_model(record["query"]) + # Single-model answer for that model; fall back to best single answer. 
def run_always_fuse(self) -> ArmResult:
    """Always-fuse arm: every query → full Quality fusion panel.

    Each record is fused through ``self.executor.run`` with the panel chosen
    by ``_quality_preset_panel``. The answer is scored against the record's
    ``ground_truth``, and the cost is the executor-reported cost, falling
    back to ``project_cost`` when the result carries none.
    """
    outcome = ArmResult(arm="always_fuse", n=len(self.dataset))
    pick_panel = self._quality_preset_panel()
    rows = outcome.per_query

    for record in self.dataset:
        query = record["query"]
        panel = pick_panel(query)
        fused = self.executor.run(query, panel, judge=self.judge)
        quality = score_answer(fused.answer, record.get("ground_truth"))
        cost = fused.cost
        if cost is None:
            cost = self.executor.project_cost(panel, self.judge, query=query)
        rows.append(
            {"id": record.get("id"), "arm": "always_fuse", "escalated": True,
             "panel": panel, "answer": fused.answer, "quality": quality,
             "cost": cost}
        )

    # With an empty dataset the dataclass defaults (0.0) already apply.
    if outcome.n:
        outcome.quality = sum((row["quality"] for row in rows), 0.0) / outcome.n
        outcome.blended_cost = sum((row["cost"] for row in rows), 0.0) / outcome.n
    outcome.escalation_p = 1.0
    # Every row is flagged escalated; ``_gate_precision`` then applies its own
    # record filter (the fixed hard slice) when computing M3, so the M3
    # denominator is not necessarily the full dataset.
    (
        outcome.n_escalated,
        outcome.n_escalated_improved,
        outcome.gate_precision,
    ) = self._gate_precision(rows, escalated_only=True)
    return outcome
+ escalated += 1 + panel = self._select_panel(query, decision.tier) + result = self.executor.run(query, panel, judge=self.judge) + q = score_answer(result.answer, record.get("ground_truth")) + cost = result.cost if result.cost is not None else self.executor.project_cost(panel, self.judge, query=query) + total_q += q + total_cost += cost + res.per_query.append( + {"id": record.get("id"), "arm": "fusion_gate", "tier": decision.tier, + "escalated": True, "panel": panel, "answer": result.answer, + "quality": q, "cost": cost} + ) + res.quality = total_q / res.n if res.n else 0.0 + res.blended_cost = total_cost / res.n if res.n else 0.0 + res.escalation_p = escalated / res.n if res.n else 0.0 + res.n_escalated, res.n_escalated_improved, res.gate_precision = self._gate_precision( + res.per_query, escalated_only=True + ) + return res + + # ----------------------------------------------------------- M3 metric + + def _gate_precision( + self, per_query: list[dict[str, Any]], escalated_only: bool + ) -> tuple[int, int, float | None]: + """Gate-precision (M3, UMB-124) — computed over the fixed HARD slice. + + APPLES-TO-APPLES: M3 is scored over the SAME hard slice for every arm + (records flagged by ``is_hard_record``), not over each arm's own + escalation set. Without this, ``always_fuse`` (which "escalates" every + query, easy + hard) and ``fusion_gate`` (which escalates only the hard + ones) would compute M3 over different denominators and the numbers would + not be comparable. + + Among the hard-slice queries an arm actually escalated, M3 is the + fraction whose synthesized fusion answer BEATS the best single-model + answer — i.e. the fusion answer is correct AND the best single answer is + not. Returns ``(n_escalated, n_escalated_improved, precision)``; + precision is ``None`` when the arm escalated no hard-slice query + (undefined — e.g. ``always_route``, which makes no escalation decision). 
def compute_verdicts(arms: dict[str, ArmResult]) -> dict[str, Any]:
    """Derive the M1 / M2 / M3 verdict blocks from the three arm results.

    M1 passes when ``fusion_gate.quality >= 0.95 * always_fuse.quality``.
    M2 passes when ``fusion_gate.blended_cost <= 1.6 * always_route.blended_cost``.
    M3 reports ``fusion_gate.gate_precision`` and passes when it is defined
    and greater than zero.

    Each ``ratio`` is gate / baseline, or ``None`` when the baseline is not
    positive. ``arms`` must contain the keys ``"always_route"``,
    ``"always_fuse"`` and ``"fusion_gate"``.
    """
    route, fuse, gate = (
        arms[key] for key in ("always_route", "always_fuse", "fusion_gate")
    )

    def ratio(numerator: float, baseline: float) -> float | None:
        return numerator / baseline if baseline > 0 else None

    quality_floor = 0.95 * fuse.quality
    cost_cap = 1.6 * route.blended_cost
    precision = gate.gate_precision

    verdict_m1 = {
        "name": "gate quality >= 95% of always-fuse quality (hard slice)",
        "gate_quality": gate.quality,
        "always_fuse_quality": fuse.quality,
        "target": quality_floor,
        "ratio": ratio(gate.quality, fuse.quality),
        "pass": gate.quality >= quality_floor,
    }
    verdict_m2 = {
        "name": "blended cost <= 1.6x always-route",
        "gate_cost": gate.blended_cost,
        "always_route_cost": route.blended_cost,
        "target": cost_cap,
        "ratio": ratio(gate.blended_cost, route.blended_cost),
        "pass": gate.blended_cost <= cost_cap,
    }
    verdict_m3 = {
        "name": "gate-precision: escalated answers that beat best single",
        "gate_precision": precision,
        "n_escalated": gate.n_escalated,
        "n_escalated_improved": gate.n_escalated_improved,
        "pass": precision is not None and precision > 0.0,
    }
    return {"M1": verdict_m1, "M2": verdict_m2, "M3": verdict_m3}
f"{r.gate_precision:.4f}", + r.n_escalated, r.n_escalated_improved] + ) + + +def _fmt(value: Any, spec: str = ".4f") -> str: + if value is None: + return "n/a" + return format(value, spec) + + +def render_results_md( + arms: dict[str, ArmResult], + verdicts: dict[str, Any], + *, + mock: bool, + dataset_path: str, + n: int, + retrain_block: str | None = None, +) -> str: + """Render the human-readable results.md report.""" + route = arms["always_route"] + fuse = arms["always_fuse"] + gate = arms["fusion_gate"] + + source = "MOCK fixtures (zero spend)" if mock else "LIVE keyed run" + lines: list[str] = [] + lines.append("# FusionGateRouter — eval harness results") + lines.append("") + if mock: + lines.append( + "> **These numbers are from MOCK fixtures (deterministic stub executor, " + "zero spend).** They validate the harness wiring and metric math, NOT " + "real model quality. **Real M1–M4 numbers require a keyed live run** " + "(`OPENROUTER_API_KEY` / `API_KEYS` set) against a real benchmark slice " + "— see the *Live run* section below." 
+ ) + else: + lines.append("> Numbers from a LIVE keyed run (real OpenRouter spend).") + lines.append("") + lines.append(f"- Source: {source}") + lines.append(f"- Hard slice: `{dataset_path}` ({n} held-out queries; GSM8K / MATH / GPQA / MBPP)") + lines.append("") + lines.append("## Per-arm metrics") + lines.append("") + lines.append("| Arm | n | Quality | Blended cost | Escalation p | Gate-precision (M3) |") + lines.append("|-----|---|---------|--------------|--------------|---------------------|") + for r in (route, fuse, gate): + lines.append( + f"| {r.arm} | {r.n} | {_fmt(r.quality)} | {_fmt(r.blended_cost, '.6f')} | " + f"{_fmt(r.escalation_p)} | {_fmt(r.gate_precision)} |" + ) + lines.append("") + lines.append("## Metric targets") + lines.append("") + m1 = verdicts["M1"] + m2 = verdicts["M2"] + m3 = verdicts["M3"] + lines.append( + f"- **M1** — {m1['name']}: gate quality {_fmt(m1['gate_quality'])} vs " + f"target {_fmt(m1['target'])} (95% of always-fuse {_fmt(m1['always_fuse_quality'])}); " + f"ratio {_fmt(m1['ratio'])} → **{'PASS' if m1['pass'] else 'FAIL'}**." + ) + lines.append( + f"- **M2** — {m2['name']}: gate cost {_fmt(m2['gate_cost'], '.6f')} vs " + f"target {_fmt(m2['target'], '.6f')} (1.6x always-route {_fmt(m2['always_route_cost'], '.6f')}); " + f"ratio {_fmt(m2['ratio'])} → **{'PASS' if m2['pass'] else 'FAIL'}**." + ) + lines.append( + f"- **M3** — {m3['name']}: gate-precision {_fmt(m3['gate_precision'])} " + f"({m3['n_escalated_improved']}/{m3['n_escalated']} escalated beat best single) " + f"→ **{'measured' if m3['pass'] else 'undefined/none'}**." + ) + lines.append("") + if retrain_block: + lines.append(retrain_block) + lines.append("") + lines.append("## Live run (keyed, real spend)") + lines.append("") + lines.append( + "The committed numbers above are from MOCK fixtures and a deterministic " + "stub executor — **zero spend, no network**. 
To produce real M1–M4 " + "numbers you must run keyed against real models:" + ) + lines.append("") + lines.append("```bash") + lines.append("# 1. Provide an OpenRouter key (never commit it):") + lines.append("export OPENROUTER_API_KEY=sk-... # or: export API_KEYS='{\"OpenRouter\": \"sk-...\"}'") + lines.append("") + lines.append("# 2. Build the real router from the plugin config and route+fuse a") + lines.append("# real benchmark slice (GSM8K/MATH/GPQA/MBPP), scoring answers with") + lines.append("# llmrouter/data/api_calling_evaluation.eval_perf. The real") + lines.append("# FusionGateRouter + FusionExecutor make the openrouter:fusion calls;") + lines.append("# all OpenRouter HTTP specifics stay inside executor.py.") + lines.append("# (This offline harness does NOT make live calls by design.)") + lines.append("```") + lines.append("") + lines.append( + "M4 (the offline log→retrain quality delta) is produced by `retrain.py`; " + "its mock delta is reported above when `--with-retrain` is passed." + ) + return "\n".join(lines) + "\n" + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def build_mock_harness( + *, + dataset_path: str | Path, + llm_path: str | Path, + routing_path: str | Path | None, + threshold: float, + budget_threshold: float | None, + k: int, + judge: str | None, + panel_preset: str, +) -> tuple[EvalHarness, list[dict[str, Any]], str]: + """Construct an offline mock harness from fixture paths. 
Returns (harness, dataset, dataset_path).""" + dataset = load_jsonl(dataset_path) + llm_data = load_llm_candidates(llm_path) + routing_data = load_jsonl(routing_path) if routing_path and Path(routing_path).exists() else None + records_by_query = {r["query"]: r for r in dataset} + executor = MockFusionExecutor(llm_data=llm_data, records_by_query=records_by_query) + harness = EvalHarness( + dataset=dataset, + llm_data=llm_data, + executor=executor, + routing_data=routing_data, + threshold=threshold, + budget_threshold=budget_threshold, + k=k, + judge=judge, + panel_preset=panel_preset, + ) + return harness, dataset, str(dataset_path) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--mock", action="store_true", default=True, + help="Offline mock mode (default; zero spend, no network).") + # --live is intentionally hidden from --help: this harness is offline-only and + # passing it is an immediate error (see the parser.error below). It is kept + # (suppressed) so a stray --live yields a clear "live mode not supported" + # message rather than an opaque "unrecognized arguments" failure. 
+ parser.add_argument("--live", dest="mock", action="store_false", + help=argparse.SUPPRESS) + parser.add_argument("--dataset", default=str(_FIXTURES_DIR / "hard_slice.jsonl"), + help="Hard-slice JSONL dataset.") + parser.add_argument("--llm", default=str(_FIXTURES_DIR / "llm_candidates.json"), + help="Candidate-metadata JSON (default_llm.json shape).") + parser.add_argument("--routing", default=str(_FIXTURES_DIR / "routing_data.jsonl"), + help="Per-model routing performance JSONL (capability source).") + parser.add_argument("--out", default=str(Path(__file__).resolve().parent / "out"), + help="Output directory for results.csv and results.md.") + parser.add_argument("--threshold", type=float, default=0.5) + parser.add_argument("--budget-threshold", type=float, default=0.3) + parser.add_argument("--k", type=int, default=2, + help="Fusion panel size. Default 2 keeps the panel cost-bounded " + "so the hard-slice blended cost stays within the M2 target; " + "the plugin config's k=3 trades cost for breadth.") + parser.add_argument("--judge", default=None) + parser.add_argument("--panel-preset", default="Quality") + parser.add_argument("--with-retrain", action="store_true", + help="Append the mock retrain (M3 before/after) delta to results.md.") + args = parser.parse_args(argv) + + if not args.mock: + parser.error( + "Live mode is intentionally not wired into this offline harness so a " + "stray run cannot spend. Use the keyed live-run path documented in " + "results.md (build the real FusionGateRouter + FusionExecutor)." 
+ ) + + harness, dataset, dataset_path = build_mock_harness( + dataset_path=args.dataset, + llm_path=args.llm, + routing_path=args.routing, + threshold=args.threshold, + budget_threshold=args.budget_threshold, + k=args.k, + judge=args.judge, + panel_preset=args.panel_preset, + ) + arms = harness.run_all() + verdicts = compute_verdicts(arms) + + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + write_results_csv(arms, out_dir / "results.csv") + + retrain_block = None + if args.with_retrain: + # Lazy import to keep the base harness dependency-light. + from . import retrain as _retrain # type: ignore + + retrain_block = _retrain.mock_retrain_report_block( + dataset=dataset, + llm_path=args.llm, + routing_path=args.routing, + k=args.k, + judge=args.judge, + ) + + # Report a portable repo-relative path so the committed results.md is not + # tied to one machine's home directory. + try: + display_path = os.path.relpath(dataset_path, _PLUGIN_DIR.parents[1]) + except ValueError: # pragma: no cover - different drive on some platforms + display_path = dataset_path + md = render_results_md( + arms, verdicts, mock=args.mock, dataset_path=display_path, + n=len(dataset), retrain_block=retrain_block, + ) + (out_dir / "results.md").write_text(md, encoding="utf-8") + + print(f"Wrote {out_dir / 'results.csv'}") + print(f"Wrote {out_dir / 'results.md'}") + for arm, r in arms.items(): + print(f" {arm}: quality={r.quality:.4f} cost={r.blended_cost:.6f} " + f"p={r.escalation_p:.4f} gate_precision={_fmt(r.gate_precision)}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl b/custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl new file mode 100644 index 0000000..94c1353 --- /dev/null +++ b/custom_routers/fusion_gate/eval/fixtures/hard_slice.jsonl @@ -0,0 +1,16 @@ +{"id": "gsm8k-easy-01", "task_name": "gsm8k", "category": "math", "query": "What is 48 plus 24?", 
"ground_truth": "72", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "72", "llama-3.1-8b-instruct": "72", "mistral-7b-instruct-v0.3": "72", "llama-3.3-nemotron-super-49b-v1": "72", "llama3-70b-instruct": "72", "mixtral-8x7b-instruct-v0.1": "72", "mixtral-8x22b-instruct-v0.1": "72"}, "single_best_answer": "72", "fusion_answer": "72"} +{"id": "gsm8k-easy-02", "task_name": "gsm8k", "category": "math", "query": "Half of 6 is what?", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "3", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "3", "llama-3.3-nemotron-super-49b-v1": "3", "llama3-70b-instruct": "3", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "3"}, "single_best_answer": "3", "fusion_answer": "3"} +{"id": "gpqa-easy-01", "task_name": "gpqa", "category": "reasoning", "query": "Pick the option. Water boils at 100C at sea level. A,B,C,D?", "ground_truth": "C", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "C", "llama-3.1-8b-instruct": "C", "mistral-7b-instruct-v0.3": "C", "llama-3.3-nemotron-super-49b-v1": "C", "llama3-70b-instruct": "C", "mixtral-8x7b-instruct-v0.1": "C", "mixtral-8x22b-instruct-v0.1": "C"}, "single_best_answer": "C", "fusion_answer": "C"} +{"id": "mbpp-easy-01", "task_name": "mbpp", "category": "code", "query": "Return the length of the list [1,2,3].", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "3", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "3", "llama-3.3-nemotron-super-49b-v1": "3", "llama3-70b-instruct": "3", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "3"}, "single_best_answer": "3", "fusion_answer": "3"} +{"id": "math-easy-01", "task_name": "math", "category": "math", "query": "Add 2 and 1.", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "3", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "3", "llama-3.3-nemotron-super-49b-v1": 
"3", "llama3-70b-instruct": "3", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "3"}, "single_best_answer": "3", "fusion_answer": "3"} +{"id": "gpqa-easy-02", "task_name": "gpqa", "category": "reasoning", "query": "Choose the answer. The sky is blue. A,B,C,D?", "ground_truth": "A", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "A", "llama-3.1-8b-instruct": "A", "mistral-7b-instruct-v0.3": "A", "llama-3.3-nemotron-super-49b-v1": "A", "llama3-70b-instruct": "A", "mixtral-8x7b-instruct-v0.1": "A", "mixtral-8x22b-instruct-v0.1": "A"}, "single_best_answer": "A", "fusion_answer": "A"} +{"id": "gsm8k-hard-01", "task_name": "gsm8k", "category": "math", "query": "Weng earns $12 an hour for babysitting. Yesterday she did 50 minutes of babysitting. Reason step by step about the unit conversion proof and box the dollar amount earned.", "ground_truth": "10", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "6", "llama-3.1-8b-instruct": "12", "mistral-7b-instruct-v0.3": "9", "llama-3.3-nemotron-super-49b-v1": "8", "llama3-70b-instruct": "11", "mixtral-8x7b-instruct-v0.1": "6", "mixtral-8x22b-instruct-v0.1": "11"}, "single_best_answer": "6", "fusion_answer": "10"} +{"id": "gsm8k-hard-02", "task_name": "gsm8k", "category": "math", "query": "Betty saves for a $100 wallet, has half, parents give $15, grandparents twice the parents. 
Reason step by step and box how much more she needs after all contributions.", "ground_truth": "5", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "20", "llama-3.1-8b-instruct": "10", "mistral-7b-instruct-v0.3": "15", "llama-3.3-nemotron-super-49b-v1": "0", "llama3-70b-instruct": "20", "mixtral-8x7b-instruct-v0.1": "15", "mixtral-8x22b-instruct-v0.1": "10"}, "single_best_answer": "15", "fusion_answer": "5"} +{"id": "math-hard-01", "task_name": "math", "category": "math", "query": "Evaluate the derivative of x^2 at x=3 using the limit-definition theorem; provide a rigorous proof of each algebra step and box the final integer value.", "ground_truth": "6", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "9", "llama-3.1-8b-instruct": "3", "mistral-7b-instruct-v0.3": "12", "llama-3.3-nemotron-super-49b-v1": "9", "llama3-70b-instruct": "9", "mixtral-8x7b-instruct-v0.1": "3", "mixtral-8x22b-instruct-v0.1": "9"}, "single_best_answer": "9", "fusion_answer": "6"} +{"id": "math-hard-02", "task_name": "math", "category": "math", "query": "Compute the probability of rolling a sum of 7 on two dice via the conditional-probability equation; prove the counting complexity step by step and box the simplified fraction.", "ground_truth": "1/6", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "1/8", "llama-3.1-8b-instruct": "5/36", "mistral-7b-instruct-v0.3": "1/12", "llama-3.3-nemotron-super-49b-v1": "1/9", "llama3-70b-instruct": "5/36", "mixtral-8x7b-instruct-v0.1": "1/9", "mixtral-8x22b-instruct-v0.1": "5/36"}, "single_best_answer": "5/36", "fusion_answer": "1/6"} +{"id": "math-hard-03", "task_name": "math", "category": "math", "query": "Solve for the 2x2 matrix determinant [[2,1],[1,3]] using the cofactor-expansion theorem; prove each algebra step of the equation and box the final integer.", "ground_truth": "5", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "7", "llama-3.1-8b-instruct": "6", "mistral-7b-instruct-v0.3": "1", 
"llama-3.3-nemotron-super-49b-v1": "7", "llama3-70b-instruct": "6", "mixtral-8x7b-instruct-v0.1": "1", "mixtral-8x22b-instruct-v0.1": "7"}, "single_best_answer": "7", "fusion_answer": "5"} +{"id": "gpqa-hard-01", "task_name": "gpqa", "category": "reasoning", "query": "Reason step by step about this hard physics question and deduce the correct option explaining the observed spectral line shift; use logic to plan the strategy. Options A,B,C,D. Box the letter.", "ground_truth": "C", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "A", "llama-3.1-8b-instruct": "B", "mistral-7b-instruct-v0.3": "D", "llama-3.3-nemotron-super-49b-v1": "A", "llama3-70b-instruct": "B", "mixtral-8x7b-instruct-v0.1": "A", "mixtral-8x22b-instruct-v0.1": "B"}, "single_best_answer": "A", "fusion_answer": "C"} +{"id": "gpqa-hard-02", "task_name": "gpqa", "category": "reasoning", "query": "Use step by step logic to deduce the molecular geometry puzzle; the reasoning strategy requires VSEPR. Plan and explain why the answer holds. Options A,B,C,D. Box the letter.", "ground_truth": "B", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "D", "llama-3.1-8b-instruct": "A", "mistral-7b-instruct-v0.3": "A", "llama-3.3-nemotron-super-49b-v1": "C", "llama3-70b-instruct": "A", "mixtral-8x7b-instruct-v0.1": "D", "mixtral-8x22b-instruct-v0.1": "C"}, "single_best_answer": "A", "fusion_answer": "B"} +{"id": "mbpp-hard-01", "task_name": "mbpp", "category": "code", "query": "Write a python function to debug the regex and count vowels in 'algorithm'; the code must compile. def count_vowels(s): ... return the integer count. 
Provide the function and result.", "ground_truth": "3", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "2", "llama-3.1-8b-instruct": "4", "mistral-7b-instruct-v0.3": "2", "llama-3.3-nemotron-super-49b-v1": "4", "llama3-70b-instruct": "2", "mixtral-8x7b-instruct-v0.1": "4", "mixtral-8x22b-instruct-v0.1": "2"}, "single_best_answer": "2", "fusion_answer": "3"} +{"id": "mbpp-hard-02", "task_name": "mbpp", "category": "code", "query": "Write a python function to debug this code and return the factorial of 5; the algorithm must compile. def fact(n): ... Provide the function and the final integer result.", "ground_truth": "120", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "60", "llama-3.1-8b-instruct": "24", "mistral-7b-instruct-v0.3": "20", "llama-3.3-nemotron-super-49b-v1": "60", "llama3-70b-instruct": "24", "mixtral-8x7b-instruct-v0.1": "20", "mixtral-8x22b-instruct-v0.1": "60"}, "single_best_answer": "60", "fusion_answer": "120"} +{"id": "mbpp-hard-03", "task_name": "mbpp", "category": "code", "query": "Write a python function to debug and compute the sum of the algorithm's list [3,7,2,8]; the code must compile. def list_sum(xs): ... 
Provide the function and the integer result.", "ground_truth": "20", "metric": "exact", "single_answers": {"qwen2.5-7b-instruct": "18", "llama-3.1-8b-instruct": "19", "mistral-7b-instruct-v0.3": "18", "llama-3.3-nemotron-super-49b-v1": "19", "llama3-70b-instruct": "18", "mixtral-8x7b-instruct-v0.1": "21", "mixtral-8x22b-instruct-v0.1": "19"}, "single_best_answer": "18", "fusion_answer": "20"} diff --git a/custom_routers/fusion_gate/eval/fixtures/llm_candidates.json b/custom_routers/fusion_gate/eval/fixtures/llm_candidates.json new file mode 100644 index 0000000..1b0ca9e --- /dev/null +++ b/custom_routers/fusion_gate/eval/fixtures/llm_candidates.json @@ -0,0 +1,58 @@ +{ + "qwen2.5-7b-instruct": { + "size": "7B", + "feature": "Fast efficient small model for instruction following.", + "input_price": 0.20, + "output_price": 0.20, + "model": "qwen/qwen2.5-7b-instruct", + "service": "OpenRouter" + }, + "llama-3.1-8b-instruct": { + "size": "8B", + "feature": "Conversational reasoning model with reasonable cost.", + "input_price": 0.20, + "output_price": 0.20, + "model": "meta/llama-3.1-8b-instruct", + "service": "OpenRouter" + }, + "mistral-7b-instruct-v0.3": { + "size": "7B", + "feature": "Fast efficient instruction-following model.", + "input_price": 0.20, + "output_price": 0.20, + "model": "mistralai/mistral-7b-instruct-v0.3", + "service": "OpenRouter" + }, + "llama-3.3-nemotron-super-49b-v1": { + "size": "49B", + "feature": "Powerful high-accuracy model for complex demanding tasks.", + "input_price": 0.90, + "output_price": 0.90, + "model": "nvidia/llama-3.3-nemotron-super-49b-v1", + "service": "OpenRouter" + }, + "llama3-70b-instruct": { + "size": "70B", + "feature": "Powerful large model for comprehensive understanding.", + "input_price": 0.90, + "output_price": 0.90, + "model": "meta/llama3-70b-instruct", + "service": "OpenRouter" + }, + "mixtral-8x7b-instruct-v0.1": { + "size": "45B", + "feature": "Mixture of experts model optimized for creative generation.", + 
"input_price": 0.60, + "output_price": 0.60, + "model": "mistralai/mixtral-8x7b-instruct-v0.1", + "service": "OpenRouter" + }, + "mixtral-8x22b-instruct-v0.1": { + "size": "141B", + "feature": "Advanced large-scale mixture of experts with exceptional performance.", + "input_price": 1.20, + "output_price": 1.20, + "model": "mistralai/mixtral-8x22b-instruct-v0.1", + "service": "OpenRouter" + } +} diff --git a/custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl b/custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl new file mode 100644 index 0000000..d6e568a --- /dev/null +++ b/custom_routers/fusion_gate/eval/fixtures/routing_data.jsonl @@ -0,0 +1,28 @@ +{"task_name": "gsm8k", "model_name": "qwen2.5-7b-instruct", "performance": 0.55} +{"task_name": "gsm8k", "model_name": "llama-3.1-8b-instruct", "performance": 0.50} +{"task_name": "gsm8k", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.35} +{"task_name": "gsm8k", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.78} +{"task_name": "gsm8k", "model_name": "llama3-70b-instruct", "performance": 0.70} +{"task_name": "gsm8k", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.45} +{"task_name": "gsm8k", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.82} +{"task_name": "math", "model_name": "qwen2.5-7b-instruct", "performance": 0.40} +{"task_name": "math", "model_name": "llama-3.1-8b-instruct", "performance": 0.45} +{"task_name": "math", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.30} +{"task_name": "math", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.72} +{"task_name": "math", "model_name": "llama3-70b-instruct", "performance": 0.66} +{"task_name": "math", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.38} +{"task_name": "math", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.75} +{"task_name": "gpqa-reasoning", "model_name": "qwen2.5-7b-instruct", "performance": 0.30} +{"task_name": 
"gpqa-reasoning", "model_name": "llama-3.1-8b-instruct", "performance": 0.42} +{"task_name": "gpqa-reasoning", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.25} +{"task_name": "gpqa-reasoning", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.70} +{"task_name": "gpqa-reasoning", "model_name": "llama3-70b-instruct", "performance": 0.60} +{"task_name": "gpqa-reasoning", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.35} +{"task_name": "gpqa-reasoning", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.74} +{"task_name": "mbpp-code", "model_name": "qwen2.5-7b-instruct", "performance": 0.48} +{"task_name": "mbpp-code", "model_name": "llama-3.1-8b-instruct", "performance": 0.58} +{"task_name": "mbpp-code", "model_name": "mistral-7b-instruct-v0.3", "performance": 0.52} +{"task_name": "mbpp-code", "model_name": "llama-3.3-nemotron-super-49b-v1", "performance": 0.76} +{"task_name": "mbpp-code", "model_name": "llama3-70b-instruct", "performance": 0.72} +{"task_name": "mbpp-code", "model_name": "mixtral-8x7b-instruct-v0.1", "performance": 0.50} +{"task_name": "mbpp-code", "model_name": "mixtral-8x22b-instruct-v0.1", "performance": 0.80} diff --git a/custom_routers/fusion_gate/eval/retrain.py b/custom_routers/fusion_gate/eval/retrain.py new file mode 100644 index 0000000..e0e2a42 --- /dev/null +++ b/custom_routers/fusion_gate/eval/retrain.py @@ -0,0 +1,464 @@ +"""retrain — scripted, repeatable gate + capability refit from fusion logs (UMB-126). + +Closes the loop: logged fusion calls (``fusion_log`` JSONL format, i.e. each line +``{ts, strategy, query, panel, judge, responses[], analysis, token, cost}``) are +fed back into the ``api_calling_evaluation`` training-row format, the routing +table is AUGMENTED with the per-model performance those responses imply, and the +gate + capability scorer are REFIT on the augmented data. We then re-measure M3 +(gate-precision) BEFORE vs AFTER and report the delta. 
+ +Pipeline (offline, deterministic, zero spend): + + 1. Load logged fusion responses (``fusion_log`` JSONL). Each ``responses[]`` + entry is decomposed via the same shape ``fusion_log.to_training_rows`` + emits: ``{query, model_name, response, performance, ...}``. + 2. Grade each decomposed response against the hard-slice ground truth + (offline exact-match; the live path would use + ``api_calling_evaluation.eval_perf``) to fill ``performance``. + 3. Build routing rows ``{task_name, model_name, performance}`` from the graded + responses and AUGMENT the base routing table with them. + 4. Refit: a fresh :class:`CapabilityScorer` over the augmented routing data + (capability scores), and re-tune the gate threshold from the augmented + difficulty/quality signal (gate refit). + 5. Re-run the fusion-gate arm with the refit components and report M3 before + vs after as a delta in the report. + +``--mock`` path: synthesizes a fusion log from the bundled fixtures (so there is +something to replay with zero spend) and runs the full before/after measurement. +Live path: point ``--log`` at a real ``fusion_log`` sink produced by keyed +``FusionGateRouter.fuse`` calls; same code path, real responses. + +This module never imports torch/pandas and makes no network call. 
+""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from .eval_harness import ( + FUSION_TIERS, + EvalHarness, + MockFusionExecutor, + RouteGate, + best_single_answer, + load_jsonl, + load_llm_candidates, + score_answer, + _FIXTURES_DIR, +) + + +# --------------------------------------------------------------------------- +# Step 1–2: replay fusion log into graded training rows +# --------------------------------------------------------------------------- + + +def synthesize_fusion_log( + dataset: list[dict[str, Any]], + llm_data: dict[str, Any], + routing_data: list[dict[str, Any]] | None, + *, + k: int, + judge: str | None, + threshold: float = 0.5, + budget_threshold: float | None = 0.3, +) -> list[dict[str, Any]]: + """Produce a fusion-log-shaped list by running the mock harness's fuse path. + + This gives the retrain loop something to replay offline with zero spend, in + exactly the ``fusion_log`` JSONL shape a live run would persist. Only queries + the gate escalates are logged (mirroring the real fuse-only logging). 
+ """ + records_by_query = {r["query"]: r for r in dataset} + executor = MockFusionExecutor(llm_data=llm_data, records_by_query=records_by_query) + harness = EvalHarness( + dataset=dataset, llm_data=llm_data, executor=executor, + routing_data=routing_data, threshold=threshold, + budget_threshold=budget_threshold, k=k, judge=judge, + ) + log: list[dict[str, Any]] = [] + for record in dataset: + query = record["query"] + decision = harness.gate.decide({"query": query}) + if decision.tier not in FUSION_TIERS: + continue + panel = harness._select_panel(query, decision.tier) + result = executor.run(query, panel, judge=judge) + log.append( + { + "strategy": "fusion", + "query": query, + "panel": list(result.panel), + "judge": result.judge, + "responses": [ + {"model": r.get("model"), "content": r.get("content")} + for r in result.responses + ], + "analysis": result.analysis, + "token": None, + "cost": result.cost, + } + ) + return log + + +def grade_log_to_training_rows( + log: list[dict[str, Any]], + dataset: list[dict[str, Any]], +) -> list[dict[str, Any]]: + """Decompose ``responses[]`` to graded training rows (api_calling_evaluation shape). + + Mirrors ``fusion_log.to_training_rows`` ({query, model_name, response, + performance, ...}) but FILLS ``performance`` by grading each response against + the matching dataset record's ground truth (offline exact-match). The live + path would grade via ``api_calling_evaluation.eval_perf``. 
+ """ + gt_by_query = {r["query"]: r for r in dataset} + rows: list[dict[str, Any]] = [] + for entry in log: + query = entry.get("query", "") + record = gt_by_query.get(query, {}) + gt = record.get("ground_truth") + task_name = record.get("task_name") + for resp in entry.get("responses", []) or []: + model = resp.get("model") + content = resp.get("content") + perf = score_answer(content, gt) if gt is not None else None + rows.append( + { + "query": query, + "task_name": task_name, + "model_name": model, + "model": model, + "response": content, + "performance": perf, + "strategy": "fusion", + "judge": entry.get("judge"), + } + ) + return rows + + +# --------------------------------------------------------------------------- +# Step 3: augment the routing table +# --------------------------------------------------------------------------- + + +def augment_routing_data( + base_routing: list[dict[str, Any]] | None, + training_rows: list[dict[str, Any]], +) -> list[dict[str, Any]]: + """Append graded (task_name, model_name, performance) rows to the base table. + + Rows missing a model or performance are skipped. The result is consumable by + :class:`CapabilityScorer` (it keys on ``task_name`` / ``model_name`` / + ``performance``), so a fresh scorer over this list is the capability refit. 
+ """ + augmented: list[dict[str, Any]] = list(base_routing or []) + for row in training_rows: + if row.get("model_name") is None or row.get("performance") is None: + continue + augmented.append( + { + "task_name": row.get("task_name") or "fusion", + "model_name": row["model_name"], + "performance": float(row["performance"]), + } + ) + return augmented + + +# --------------------------------------------------------------------------- +# Step 4: refit the gate threshold +# --------------------------------------------------------------------------- + + +def refit_gate_thresholds( + dataset: list[dict[str, Any]], + training_rows: list[dict[str, Any]], + *, + current_threshold: float, + current_budget_threshold: float | None, +) -> tuple[float, float | None]: + """Re-tune the gate thresholds from the augmented quality signal. + + Heuristic, deterministic refit examining the logged fusion responses: + + * Where fusion was logged but the BEST single answer was already correct, + the escalation was WASTED — raise the lower (``budget_threshold``) floor + so those low-difficulty queries route single next time, lifting M3 + precision (fewer non-improving escalations). + * The upper ``threshold`` is nudged by how reliably fusion BEAT the best + single answer overall: more help → lower it (escalate more), less → + raise it. Bounded so the refit can never disable the gate. + + Returns ``(threshold, budget_threshold)``. The live path could fit a learned + ``DifficultyEstimator`` here instead; this offline refit stays torch-free and + reproducible. + """ + gt_by_query = {r["query"]: r for r in dataset} + by_query: dict[str, list[dict[str, Any]]] = {} + for row in training_rows: + by_query.setdefault(row["query"], []).append(row) + + helped = 0 + wasted = 0 + total = 0 + wasted_difficulties: list[float] = [] + # A throwaway gate purely to score difficulty consistently with the harness. 
def measure_m3(
    dataset: list[dict[str, Any]],
    llm_data: dict[str, Any],
    routing_data: list[dict[str, Any]] | None,
    *,
    threshold: float,
    budget_threshold: float | None,
    k: int,
    judge: str | None,
) -> tuple[float | None, int, int]:
    """Run the fusion-gate arm once and report its escalation statistics.

    Builds a ``MockFusionExecutor`` keyed by each dataset record's ``query``,
    wires it into an ``EvalHarness`` with the given routing data and gate
    settings, and runs ``run_fusion_gate``.

    Returns:
        ``(gate_precision, n_escalated, n_escalated_improved)`` read from the
        arm returned by the harness.
    """
    mock_executor = MockFusionExecutor(
        llm_data=llm_data,
        records_by_query={record["query"]: record for record in dataset},
    )
    gate_arm = EvalHarness(
        dataset=dataset,
        llm_data=llm_data,
        executor=mock_executor,
        routing_data=routing_data,
        threshold=threshold,
        budget_threshold=budget_threshold,
        k=k,
        judge=judge,
    ).run_fusion_gate()
    return (
        gate_arm.gate_precision,
        gate_arm.n_escalated,
        gate_arm.n_escalated_improved,
    )
def _fmt(value: Any, spec: str = ".4f") -> str:
    """Format ``value`` with ``spec``; ``None`` renders as ``"n/a"``."""
    return "n/a" if value is None else format(value, spec)


def render_retrain_block(result: dict[str, Any], *, mock: bool) -> str:
    """Render the retrain before/after delta as a markdown block.

    Args:
        result: The dict produced by ``run_retrain`` (log/row counts, the
            before/after thresholds, M3 precision and escalation counts).
        mock: ``True`` when the numbers come from mock fixtures; selects the
            "Source" line and appends the mock disclaimer.

    Returns:
        The markdown text (no trailing newline).
    """
    src = "MOCK fixtures (synthesized fusion log, zero spend)" if mock else "LIVE fusion log"

    budget_before = result.get("budget_threshold_before")
    budget_after = result.get("budget_threshold_after")
    # Only claim the floor was raised when the refit actually raised it; the
    # refit leaves it untouched when no escalation was wasted, and printing
    # the explanation unconditionally made the report misleading.
    budget_raised = budget_after is not None and (
        budget_before is None or budget_after > budget_before
    )
    budget_note = (
        " (raised so wasted low-difficulty escalations route single)"
        if budget_raised
        else ""
    )

    lines = [
        "## Retrain (UMB-126): M3 before vs after",
        "",
        f"- Source: {src}",
        f"- Replayed {result['n_log_entries']} fusion-log entries → "
        f"{result['n_training_rows']} graded training rows.",
        f"- Routing table augmented: {result['n_base_routing_rows']} → "
        f"{result['n_augmented_routing_rows']} rows.",
        f"- Gate threshold refit: {_fmt(result['threshold_before'], '.3f')} → "
        f"{_fmt(result['threshold_after'], '.3f')}.",
        f"- Gate budget_threshold refit: {_fmt(budget_before, '.3f')} → "
        f"{_fmt(budget_after, '.3f')}{budget_note}.",
        "",
        "| Metric | Before | After | Delta |",
        "|--------|--------|-------|-------|",
        f"| M3 gate-precision | {_fmt(result['m3_before'])} | {_fmt(result['m3_after'])} | "
        f"{_fmt(result['m3_delta'], '+.4f')} |",
        f"| Escalated | {result['escalated_before']} | {result['escalated_after']} | "
        f"{result['escalated_after'] - result['escalated_before']:+d} |",
        f"| Escalated-and-improved | {result['improved_before']} | {result['improved_after']} | "
        f"{result['improved_after'] - result['improved_before']:+d} |",
    ]
    if mock:
        lines.append("")
        lines.append(
            "> Retrain numbers are from MOCK fixtures; the real M3 delta (M4) "
            "requires a keyed live run replaying a real fusion-log sink."
        )
    return "\n".join(lines)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the retrain loop and write ``retrain.md`` / ``retrain.json``.

    When ``--log`` is given, that fusion-log JSONL sink is replayed and the
    report is labelled as coming from a real log. When ``--log`` is omitted, a
    log is synthesized from the fixtures and the report is labelled MOCK.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success. An explicit ``--log`` path that does not exist is a
        usage error (argparse exits with status 2) rather than a silent
        fallback to synthesized data.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    # Retained for CLI compatibility; it has always defaulted to True, so the
    # mock/live label is derived from whether a log was actually replayed.
    parser.add_argument("--mock", action="store_true", default=True,
                        help="Offline mock mode (default; synthesizes a log from fixtures, zero spend).")
    parser.add_argument("--dataset", default=str(_FIXTURES_DIR / "hard_slice.jsonl"))
    parser.add_argument("--llm", default=str(_FIXTURES_DIR / "llm_candidates.json"))
    parser.add_argument("--routing", default=str(_FIXTURES_DIR / "routing_data.jsonl"))
    parser.add_argument("--log", default=None,
                        help="Path to a fusion_log JSONL sink to replay. "
                             "When omitted in --mock, a log is synthesized from fixtures.")
    parser.add_argument("--out", default=str(Path(__file__).resolve().parent / "out"))
    parser.add_argument("--k", type=int, default=2,
                        help="Fusion panel size (default 2, matching eval_harness).")
    parser.add_argument("--judge", default=None)
    # Loose before-thresholds by default so the offline demonstration shows a
    # real before→after M3 lift (the loose gate over-escalates easy queries;
    # the refit raises the budget floor to remove that wasted spend). Tighten
    # these to match a live config when replaying a real fusion-log sink.
    parser.add_argument("--threshold", type=float, default=0.4)
    parser.add_argument("--budget-threshold", type=float, default=0.1)
    args = parser.parse_args(argv)

    dataset = load_jsonl(args.dataset)
    llm_data = load_llm_candidates(args.llm)
    base_routing = load_jsonl(args.routing) if Path(args.routing).exists() else None

    replaying = bool(args.log)
    if replaying:
        # A mistyped path must not silently turn into a synthesized run whose
        # numbers would then be mistaken for the replayed log's results.
        if not Path(args.log).exists():
            parser.error(f"--log path does not exist: {args.log}")
        log = load_jsonl(args.log)
    else:
        log = synthesize_fusion_log(
            dataset, llm_data, base_routing, k=args.k, judge=args.judge,
            threshold=args.threshold, budget_threshold=args.budget_threshold,
        )

    result = run_retrain(
        dataset, llm_data, base_routing, log=log, k=args.k, judge=args.judge,
        threshold=args.threshold, budget_threshold=args.budget_threshold,
    )
    # Only a synthesized log is "MOCK"; a replayed sink is reported as such.
    block = render_retrain_block(result, mock=not replaying)

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "retrain.md").write_text(block + "\n", encoding="utf-8")
    (out_dir / "retrain.json").write_text(json.dumps(result, indent=2), encoding="utf-8")

    print(f"Wrote {out_dir / 'retrain.md'}")
    print(f"M3 before={_fmt(result['m3_before'])} after={_fmt(result['m3_after'])} "
          f"delta={_fmt(result['m3_delta'], '+.4f')}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
DEFAULT_ENDPOINT = "https://openrouter.ai/api/v1"

# Provider key used to resolve the OpenRouter credential from an API_KEYS dict.
OPENROUTER_PROVIDER = "OpenRouter"

# Default per-completion output-token estimate used by project_cost when no
# explicit completion-token count is supplied. Overridable via the
# ``est_completion_tokens`` hparam.
DEFAULT_EST_COMPLETION_TOKENS = 512

# Roughly four characters per token — the standard heuristic for estimating
# prompt token count from raw query text.
_CHARS_PER_TOKEN = 4

# OpenRouter server-tool identifier (BETA). Confined to this module.
FUSION_TOOL_TYPE = "openrouter:fusion"


class CostCeilingExceeded(Exception):
    """Raised when the projected fusion cost exceeds the configured ceiling.

    Carries the projected per-query DOLLAR cost and the ceiling (also in dollars)
    so callers can log/report the abort without re-projecting. Raised BEFORE any
    HTTP call is made.
    """

    def __init__(self, projected: float, ceiling: float):
        self.projected = projected
        self.ceiling = ceiling
        super().__init__(
            f"Projected fusion cost ${projected:.6f} exceeds cost_ceiling "
            f"${ceiling:.6f} per query; aborting before the OpenRouter call."
        )


class FusionExecutorError(Exception):
    """Raised on an unrecoverable OpenRouter fusion response (transport/parse)."""


@dataclass
class FusionResult:
    """Parsed output of a fusion call.

    answer    : final synthesized answer (judge output, or fallback from panel)
    analysis  : structured analysis JSON (consensus/contradictions/blind_spots),
                or None when the judge failed
    responses : raw per-model responses [{"model", "content"}] — the training
                signal consumed by the log sink (UMB-125)
    panel     : panel actually used
    judge     : judge model actually used
    cost      : total cost (sum of panel completions + judge) when available
    raw       : the untouched provider payload, for debugging
    """

    answer: str = ""
    analysis: dict[str, Any] | None = None
    responses: list[dict[str, Any]] = field(default_factory=list)
    panel: list[str] = field(default_factory=list)
    judge: str | None = None
    cost: float | None = None
    raw: dict[str, Any] | None = None


class FusionExecutor:
    """Runs one ``openrouter:fusion`` call and parses it into a FusionResult.

    All OpenRouter-specific request/response handling is confined to this
    class so the beta server-tool API has a single point of change.
    """

    def __init__(
        self,
        llm_data: dict[str, Any],
        judge: str | None = None,
        panel_preset: str = "Quality",
        cost_ceiling: float | None = None,
        api_endpoint: str | None = None,
        est_completion_tokens: int = DEFAULT_EST_COMPLETION_TOKENS,
    ):
        self.llm_data = llm_data
        self.judge = judge
        self.panel_preset = panel_preset
        self.cost_ceiling = cost_ceiling
        self.api_endpoint = api_endpoint or DEFAULT_ENDPOINT
        self.est_completion_tokens = int(est_completion_tokens)

    def run(
        self,
        query: str,
        panel: list[str],
        judge: str | None = None,
        api_keys: dict[str, str] | None = None,
        **gen_kwargs: Any,
    ) -> FusionResult:
        """Execute one fusion call against the OpenRouter `openrouter:fusion` tool.

        A SINGLE POST to ``{api_endpoint}/chat/completions`` carries the panel as
        the tool's ``analysis_models`` and the judge as the tool's ``model``, with
        ``tool_choice="required"`` so the gate's fuse decision is honored.

        Args:
            query: The user query to fuse over.
            panel: Panel model slugs (-> tool ``analysis_models``).
            judge: Judge model slug (-> tool ``model``); falls back to the
                executor's configured judge, then to the outer model when unset.
            api_keys: Optional ``{"OpenRouter": "<key>"}`` provider dict. When
                absent, the key is resolved from the ``OPENROUTER_API_KEY`` env
                var or an ``API_KEYS`` JSON env var.
            **gen_kwargs: Extra generation params merged into the request body
                (e.g. ``temperature``, ``max_tokens``).

        Returns:
            FusionResult with parsed ``responses``/``analysis``. On judge failure
            (status ``ok`` with ``analysis`` omitted) the answer is synthesized
            from ``responses`` and ``analysis`` is ``None``.

        Raises:
            CostCeilingExceeded: when the projected cost exceeds ``cost_ceiling``
                (raised before any network call).
            FusionExecutorError: on transport failure or an unparseable payload.
            ValueError: when ``panel`` is empty.
        """
        judge = judge or self.judge

        # Cost guard: abort BEFORE the HTTP call so a too-expensive fusion never
        # reaches the network.
        if self.cost_ceiling is not None:
            projected = self.project_cost(panel, judge, query=query)
            if projected > self.cost_ceiling:
                raise CostCeilingExceeded(projected, self.cost_ceiling)

        api_key = self._resolve_api_key(api_keys)

        body = self._build_request_body(query, panel, judge, gen_kwargs)
        payload = self._post_chat_completions(body, api_key)
        return self._parse_payload(payload, panel, judge)

    # ------------------------------------------------------- OpenRouter (BETA)
    # Everything below this line is OpenRouter-specific request/response handling
    # and MUST stay confined to this module (the beta server-tool blast point).

    def _resolve_api_key(self, api_keys: dict[str, str] | None) -> str:
        """Resolve the OpenRouter key without logging it.

        Resolution order:
          1. ``api_keys["OpenRouter"]`` (explicit provider dict),
          2. ``OPENROUTER_API_KEY`` env var,
          3. ``API_KEYS`` env var parsed as a JSON ``{"OpenRouter": "..."}`` dict.

        The key value is never logged or echoed.

        Raises:
            FusionExecutorError: when ``API_KEYS`` is not valid JSON or no key
                can be found in any of the three sources.
        """
        if api_keys:
            key = api_keys.get(OPENROUTER_PROVIDER)
            if key:
                return key

        env_key = os.environ.get("OPENROUTER_API_KEY")
        if env_key:
            return env_key

        raw = os.environ.get("API_KEYS")
        if raw:
            try:
                parsed = json.loads(raw)
            except (ValueError, TypeError) as exc:
                raise FusionExecutorError(
                    "API_KEYS env var is not valid JSON; cannot resolve the "
                    f"{OPENROUTER_PROVIDER} key."
                ) from exc
            key = parsed.get(OPENROUTER_PROVIDER) if isinstance(parsed, dict) else None
            if key:
                return key

        raise FusionExecutorError(
            f"No {OPENROUTER_PROVIDER} API key found. Provide api_keys="
            f'{{"{OPENROUTER_PROVIDER}": "..."}}, set OPENROUTER_API_KEY, or set '
            "API_KEYS as a JSON object."
        )

    def _build_request_body(
        self,
        query: str,
        panel: list[str],
        judge: str | None,
        gen_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Build the chat/completions body carrying the openrouter:fusion tool.

        The outer ``model`` defaults to the judge slug when one is configured,
        falling back to the panel head; the tool's ``model`` (judge) is only
        set when a judge is given.

        Raises:
            ValueError: when ``panel`` is empty. A fusion call has no meaning
                without at least one analysis model, and an empty ``model`` field
                would produce a nonsensical OpenRouter request.
        """
        if not panel:
            raise ValueError("panel must be non-empty for a fusion call")
        outer_model = judge or panel[0]
        parameters: dict[str, Any] = {"analysis_models": list(panel)}
        if judge:
            parameters["model"] = judge

        body: dict[str, Any] = {
            "model": outer_model,
            "messages": [{"role": "user", "content": query}],
            "tools": [{"type": FUSION_TOOL_TYPE, "parameters": parameters}],
            "tool_choice": "required",
        }
        # Allow callers to pass through generation params without overriding the
        # fusion-defining keys above.
        for key, value in gen_kwargs.items():
            if key not in body:
                body[key] = value
        return body

    def _post_chat_completions(
        self, body: dict[str, Any], api_key: str
    ) -> dict[str, Any]:
        """POST the request and return the decoded JSON payload.

        Prefers ``requests`` when importable; otherwise uses stdlib ``urllib``.
        The Authorization header carries the key but is never logged.

        Raises:
            FusionExecutorError: on any transport, HTTP-status or decode failure.
        """
        # Tolerate a configured endpoint with a trailing slash so the request
        # does not go to ``.../v1//chat/completions``.
        url = f"{self.api_endpoint.rstrip('/')}/chat/completions"
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        try:
            import requests  # type: ignore
        except ImportError:
            return self._post_urllib(url, headers, body)

        try:
            resp = requests.post(url, headers=headers, json=body, timeout=120)
            resp.raise_for_status()
            return resp.json()
        except Exception as exc:  # noqa: BLE001 - normalize transport/HTTP errors
            # Surface the HTTP status (e.g. 429 / 503) when present so callers can
            # distinguish a retryable rate-limit/outage from a hard transport
            # failure. The status code carries no secret; the key/headers/body are
            # never included in the message.
            status = getattr(getattr(exc, "response", None), "status_code", None)
            detail = f" (HTTP {status})" if status is not None else ""
            raise FusionExecutorError(
                f"OpenRouter fusion request failed: {type(exc).__name__}{detail}"
            ) from exc

    def _post_urllib(
        self, url: str, headers: dict[str, str], body: dict[str, Any]
    ) -> dict[str, Any]:
        """stdlib fallback transport for the chat/completions POST.

        Raises:
            FusionExecutorError: on any transport, HTTP-status or decode failure.
        """
        import http.client
        import urllib.request

        data = json.dumps(body).encode("utf-8")
        req = urllib.request.Request(url, data=data, headers=headers, method="POST")
        try:
            with urllib.request.urlopen(req, timeout=120) as resp:  # noqa: S310
                raw = resp.read().decode("utf-8")
            return json.loads(raw)
        except (OSError, http.client.HTTPException, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; catching OSError also
            # normalizes read timeouts and connection resets raised while the
            # body is being read, and HTTPException covers truncated reads.
            # A urllib HTTPError carries ``.code`` (the HTTP status); surface it
            # so 429/503 are recoverable from the message. No secret is included
            # (only the status integer).
            status = getattr(exc, "code", None)
            detail = f" (HTTP {status})" if status is not None else ""
            raise FusionExecutorError(
                f"OpenRouter fusion request failed: {type(exc).__name__}{detail}"
            ) from exc

    def _parse_payload(
        self, payload: dict[str, Any], panel: list[str], judge: str | None
    ) -> FusionResult:
        """Parse the OpenRouter fusion tool payload into a FusionResult.

        The tool result is shaped ``{status, analysis?, responses: [...]}``. The
        ``responses[]`` entries are normalized to ``{"model", "content"}`` (a
        ``null`` content becomes ``""``). When the judge fails (``analysis``
        omitted) the answer is synthesized from the panel responses and
        ``analysis`` is ``None``. When an analysis is present but yields neither
        an ``answer`` nor a ``consensus``, the answer likewise falls back to the
        panel responses instead of being returned empty.

        Raises:
            FusionExecutorError: when no tool result can be located.
        """
        tool = self._extract_tool_result(payload)

        responses: list[dict[str, Any]] = []
        for item in tool.get("responses", []) or []:
            if isinstance(item, dict):
                content = item.get("content")
                responses.append(
                    {
                        "model": item.get("model"),
                        "content": "" if content is None else content,
                    }
                )

        raw_analysis = tool.get("analysis")
        analysis: dict[str, Any] | None = None
        answer = tool.get("answer", "")
        if isinstance(raw_analysis, dict):
            analysis = {
                "consensus": raw_analysis.get("consensus"),
                "contradictions": raw_analysis.get("contradictions"),
                "blind_spots": raw_analysis.get("blind_spots"),
            }
            if not answer:
                answer = raw_analysis.get("consensus") or ""
            if not answer:
                # Judge returned an analysis without a usable conclusion: fall
                # back to the panel rather than handing back an empty answer.
                answer = self._synthesize_answer(responses)
        else:
            # Judge-failure mode: status "ok" but analysis omitted. Synthesize an
            # answer from the panel responses; do not crash.
            answer = self._synthesize_answer(responses)

        cost = tool.get("cost", payload.get("cost"))
        # bool is an int subclass; a stray ``true`` must not be read as $1.00.
        is_number = isinstance(cost, (int, float)) and not isinstance(cost, bool)
        cost_value = float(cost) if is_number else None

        return FusionResult(
            answer=answer or "",
            analysis=analysis,
            responses=responses,
            panel=list(panel),
            judge=judge,
            cost=cost_value,
            raw=payload,
        )

    def _extract_tool_result(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Locate the fusion tool result inside the chat/completions payload.

        Accepts either a top-level tool result (``{status, responses, ...}``) or
        the tool result nested in the first choice's message tool_calls (as a
        ``result`` dict, or as ``function.arguments`` given as a dict or a JSON
        string).

        Raises:
            FusionExecutorError: when the payload holds no parseable tool
                result, including malformed shapes such as ``"message": null``.
        """
        if isinstance(payload, dict):
            if "responses" in payload:
                return payload

            # Walk choices[0].message.tool_calls defensively: every level may be
            # missing or null in an error response, and that must surface as a
            # FusionExecutorError rather than an AttributeError/KeyError.
            choices = payload.get("choices")
            first = choices[0] if isinstance(choices, (list, tuple)) and choices else None
            message = first.get("message") if isinstance(first, dict) else None
            tool_calls = message.get("tool_calls") if isinstance(message, dict) else None
            if not isinstance(tool_calls, (list, tuple)):
                tool_calls = []

            for call in tool_calls:
                if not isinstance(call, dict):
                    continue
                result = call.get("result")
                if isinstance(result, dict):
                    return result
                func = call.get("function", {})
                args = func.get("arguments") if isinstance(func, dict) else None
                if isinstance(args, str):
                    try:
                        parsed = json.loads(args)
                    except ValueError:
                        continue
                    if isinstance(parsed, dict):
                        return parsed
                elif isinstance(args, dict):
                    return args

        raise FusionExecutorError(
            "OpenRouter fusion payload contained no parseable tool result."
        )

    @staticmethod
    def _synthesize_answer(responses: list[dict[str, Any]]) -> str:
        """Build a fallback answer from panel responses when the judge fails.

        Non-empty contents are stripped and joined with a blank line. Missing or
        ``None`` contents are skipped (never rendered as the text ``"None"``).
        """
        contents = (r.get("content") for r in responses)
        stripped = (str(c).strip() for c in contents if c is not None)
        return "\n\n".join(text for text in stripped if text)

    def project_cost(
        self,
        panel: list[str],
        judge: str | None,
        query: str | None = None,
        prompt_tokens: int | None = None,
    ) -> float:
        """Estimate the per-query DOLLAR cost of the panel + judge for the cost guard.

        DOLLARS: the returned value is an estimated per-query dollar cost, NOT a
        relative unit-price proxy. ``input_price`` / ``output_price`` in
        ``llm_data`` are per-million-token prices, so for each panel member plus
        the judge::

            dollars += (input_price * prompt_tokens
                        + output_price * completion_tokens) / 1e6

        ``prompt_tokens`` is taken from the explicit argument when given, else
        estimated from ``query`` as ``max(1, len(query) // 4)`` (~4 chars/token),
        else falls back to ``est_completion_tokens`` when neither is available.
        ``completion_tokens`` is the config-driven ``est_completion_tokens``
        default. Operators set ``cost_ceiling`` in dollars per query.

        NOTE(review): a model missing from ``llm_data`` (or with a missing/null
        price) contributes $0, so the guard under-estimates panels that contain
        unpriced models.
        """
        if prompt_tokens is not None:
            prompt_toks = max(1, int(prompt_tokens))
        elif query is not None:
            prompt_toks = max(1, len(query) // _CHARS_PER_TOKEN)
        else:
            prompt_toks = self.est_completion_tokens
        completion_toks = self.est_completion_tokens

        members = list(panel) + ([judge] if judge else [])
        total = 0.0
        for name in members:
            info = self.llm_data.get(name) or {}
            # ``or 0.0`` tolerates an explicit null price instead of raising.
            input_price = float(info.get("input_price") or 0.0)
            output_price = float(info.get("output_price") or 0.0)
            total += (input_price * prompt_toks + output_price * completion_toks) / 1e6
        return total
# Default JSONL sink, shared with the OpenClaw memory bank layout.
DEFAULT_SINK_PATH = str(Path.home() / ".llmrouter" / "openclaw_memory.jsonl")

# Exact (case-insensitive) key names that mark a mapping entry as
# credential-bearing. Any matching key is dropped before serialization, at any
# nesting depth.
#
# Exact-match (not substring) is deliberate: substring matching on "token" /
# "auth" / "session" silently drops legitimate fields like ``prompt_tokens``,
# ``completion_tokens``, ``author``, ``authentication_method``, and
# ``session_id`` that may appear in usage/tracing metadata or multi-turn
# response structures. Actual inline credentials in free text are caught by
# ``_INLINE_SECRET_RE`` instead, which is the right tool for that job.
_SECRET_KEYS = frozenset(
    {
        "api_key",
        "apikey",
        "authorization",
        "bearer",
        "secret",
        "password",
        "passwd",
        "credential",
        "cookie",
    }
)

# Inline credential shapes to scrub from free text (e.g. accidental leakage in a
# model response). Conservative: redact obvious key formats, not arbitrary text.
_INLINE_SECRET_RE = re.compile(
    r"\b(sk-[A-Za-z0-9_\-]{12,}|Bearer\s+[A-Za-z0-9._\-]{12,})",
    re.IGNORECASE,
)

_REDACTED = "[REDACTED]"


def _is_secret_key(key: str) -> bool:
    """Return True when a mapping key is a known credential-bearing key name."""
    return key.lower() in _SECRET_KEYS


def _scrub(value: Any) -> Any:
    """Recursively drop secret-keyed entries and redact inline credentials.

    Mappings: keys whose name is in :data:`_SECRET_KEYS` are removed entirely.
    Strings: inline key/bearer shapes are replaced with ``[REDACTED]``.
    Lists and tuples are walked element-wise (and returned as lists); any other
    value is returned unchanged.
    """
    if isinstance(value, dict):
        return {
            str(k): _scrub(v)
            for k, v in value.items()
            if not _is_secret_key(str(k))
        }
    if isinstance(value, (list, tuple)):
        return [_scrub(v) for v in value]
    if isinstance(value, str):
        return _INLINE_SECRET_RE.sub(_REDACTED, value)
    return value


def _scrub_response(resp: dict[str, Any]) -> dict[str, Any]:
    """Normalize one panel response to ``{"model", "content"}``, scrubbed.

    Tolerates the executor's ``{"model", "content"}`` shape while dropping any
    extra credential-bearing fields a provider payload might carry. A
    non-dict ``resp`` yields ``{"model": None, "content": None}``.
    """
    safe = _scrub(resp) if isinstance(resp, dict) else {}
    return {
        "model": safe.get("model"),
        "content": safe.get("content"),
    }


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _resolve_path(sink_path: str | None) -> Path:
    """Resolve the sink path, expanding ``~`` and environment variables.

    A missing or blank ``sink_path`` resolves to :data:`DEFAULT_SINK_PATH`.
    """
    raw = (sink_path or "").strip() or DEFAULT_SINK_PATH
    return Path(os.path.expanduser(os.path.expandvars(raw)))


def log_fusion(
    result: "FusionResult",
    query: str,
    sink_path: str | None = None,
    token: int | None = None,
    cost: float | None = None,
) -> Path:
    """Append one structured JSONL entry describing a fusion call.

    Args:
        result: Parsed fusion output (panel responses, analysis, judge, cost).
        query: The user query that triggered the fusion call.
        sink_path: Target JSONL file. Defaults to
            ``~/.llmrouter/openclaw_memory.jsonl``. ``~`` / env vars are expanded.
        token: Total token count for the call, when known. Falls back to None.
        cost: Total cost for the call. Falls back to ``result.cost`` when None.

    Returns:
        The resolved :class:`~pathlib.Path` the entry was appended to.

    Notes:
        The provider's raw payload (``result.raw``) is intentionally NOT written.
        The query, responses and analysis are scrubbed for credential-bearing
        keys and inline secret shapes before being persisted.
    """
    path = _resolve_path(sink_path)

    record = {
        "ts": _utc_now_iso(),
        "strategy": "fusion",
        # The query is free user text and just as likely as a model response to
        # contain a pasted key, so it goes through the same inline redaction.
        "query": _scrub(query),
        "panel": list(result.panel),
        "judge": result.judge,
        "responses": [_scrub_response(r) for r in result.responses],
        "analysis": _scrub(result.analysis) if result.analysis is not None else None,
        "token": token,
        "cost": cost if cost is not None else result.cost,
    }

    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

    return path


def to_training_rows(result: "FusionResult", query: str) -> list[dict[str, Any]]:
    """Decompose ``responses[]`` into per-model FusionFactory training rows.

    Each panel response becomes one row with the keys ``query`` /
    ``model_name`` / ``response`` / ``performance``. The ``model`` alias is
    included alongside ``model_name`` so the rows also carry the
    ``query`` / ``model`` pair.

    Args:
        result: Parsed fusion output containing the panel ``responses[]``.
        query: The user query that produced the responses.

    Returns:
        One dict per panel response. ``performance`` is ``None`` because fusion
        responses are not graded at log time; an offline evaluator fills it in.
        Both the query and the response content are scrubbed of inline secrets.
    """
    # Scrub once: the same redacted query is shared by every row.
    safe_query = _scrub(query)
    rows: list[dict[str, Any]] = []
    for resp in result.responses:
        safe = _scrub_response(resp)
        model = safe.get("model")
        rows.append(
            {
                "query": safe_query,
                "model_name": model,
                "model": model,
                "response": safe.get("content"),
                "performance": None,
                "strategy": "fusion",
                "judge": result.judge,
            }
        )
    return rows
+""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from typing import Any, Callable, Literal + +# Three-tier dial (UMB-124). The middle tier escalates a mid-difficulty query to +# a *cheap* Budget fusion panel; the top tier escalates the hardest queries to +# the full Quality fusion panel. ``"fusion"`` is retained as an alias-free +# explicit Quality tier so existing callers/tests that branch on +# ``tier == "fusion"`` keep working. Extension is additive: a new value is added +# between the existing ones, not a rename. +Tier = Literal["single", "budget_fusion", "fusion"] + +# Tiers that take the FUSION path (panel + judge) rather than single routing. +FUSION_TIERS: frozenset[str] = frozenset({"budget_fusion", "fusion"}) + +# Maps a fusion tier to the panel preset used when capability data is missing. +# Only the mid tier has a fixed preset here: ``budget_fusion`` always falls back +# to the cheap Budget panel. The top ``fusion`` tier is intentionally absent so +# ``TIER_TO_PRESET.get(tier, self.panel_preset)`` resolves it to the router's +# configured ``panel_preset`` (Quality by default) — a hardcoded "fusion": +# "Quality" entry here would be dead data, always overridden by panel_preset. +TIER_TO_PRESET: dict[str, str] = { + "budget_fusion": "Budget", +} + + +def resolve_preset(tier: str, default_preset: str) -> str: + """Resolve the panel preset for ``tier``, falling back to ``default_preset``. + + Single source of truth for the tier->preset mapping shared by + ``FusionGateRouter._select_panel`` and the eval harness, so the two cannot + silently diverge. The mid ``budget_fusion`` tier maps to the cheap Budget + panel via ``TIER_TO_PRESET``; every other tier (notably the top ``fusion`` + tier, deliberately absent from ``TIER_TO_PRESET``) resolves to the caller's + configured ``default_preset`` (typically ``panel_preset``, Quality by + default). + + Args: + tier: The gate-decided tier (e.g. 
``"budget_fusion"`` / ``"fusion"``). + default_preset: Preset to use when ``tier`` has no fixed mapping. + + Returns: + The preset name (e.g. ``"Budget"`` / ``"Quality"``). + """ + return TIER_TO_PRESET.get(tier, default_preset) + +# --- lexical heuristic tuning constants (documented, deterministic) ---------- +# Difficulty is a weighted blend of independent signals, each normalized to +# [0, 1], then clamped. Weights sum to 1.0 so difficulty stays in [0, 1]. +_LENGTH_SATURATION_CHARS = 400.0 # query length at which the length signal hits 1.0 +_LENGTH_WEIGHT = 0.40 +_CODE_MATH_WEIGHT = 0.35 +_MULTIPART_WEIGHT = 0.25 +_MULTIPART_SATURATION = 3.0 # number of sub-questions at which the signal hits 1.0 + +# Markers that indicate code or math content (case-insensitive substring match +# for words; symbol matches are literal). Deliberately conservative. +_CODE_MATH_KEYWORDS = ( + "code", + "function", + "algorithm", + "compile", + "debug", + "regex", + "integral", + "derivative", + "theorem", + "proof", + "equation", + "matrix", + "complexity", +) +_CODE_MATH_SYMBOLS = ("```", "def ", "class ", "{", "}", ";", "=>", "==", "->", "\\", "^", "∫", "∑", "√") + + +@dataclass +class GateDecision: + """Result of the gate. + + tier : "single" or "fusion" + model_name : chosen model when tier == "single" (None for fusion) + panel : optional pre-selected panel for fusion (UMB-123 may fill this); + when empty, the router/executor fall back to the preset + difficulty : estimated difficulty in [0, 1] (for logging / threshold tuning) + confidence : router confidence in [0, 1] + """ + + tier: Tier + model_name: str | None = None + panel: list[str] = field(default_factory=list) + difficulty: float = 0.0 + confidence: float = 1.0 + + +class RouteGate: + """Route-vs-fuse gate. + + Args: + llm_data: name -> candidate-metadata mapping (from default_llm.json). + threshold: difficulty cutoff to escalate single -> fusion. 
Sourced from + the router YAML and injected by ``FusionGateRouter``; never hardcoded + here beyond a permissive default for standalone construction. + estimator: optional learned difficulty estimator. Any callable mapping a + query embedding to a difficulty score in [0, 1]. Duck-typed so no + torch dependency is introduced; a scalar is extracted via ``.item()`` + when the return value exposes it (e.g. a 0-d tensor). + """ + + def __init__( + self, + llm_data: dict[str, Any], + threshold: float = 0.5, + estimator: Callable[[Any], Any] | None = None, + budget_threshold: float | None = None, + ): + self.llm_data = llm_data + self.llm_names = list(llm_data.keys()) + self.threshold = threshold + self.estimator = estimator + # Three-tier dial (UMB-124). ``budget_threshold`` is the LOWER boundary: + # difficulty < budget_threshold -> single + # budget_threshold <= difficulty < threshold -> budget_fusion (cheap) + # difficulty >= threshold -> fusion (Quality) + # When ``budget_threshold`` is None (or >= threshold) the middle tier is + # disabled and the gate degrades to the original two-tier single/fusion + # behavior, so existing two-threshold-free configs are unaffected. + if budget_threshold is None or budget_threshold >= threshold: + self.budget_threshold = threshold + else: + self.budget_threshold = budget_threshold + + def decide(self, query_input: dict) -> GateDecision: + """Decide single-vs-fusion for one query. + + Rules (three-tier dial, UMB-124): + - ``high_stakes`` forces the full Quality ``fusion`` tier regardless of + difficulty (max confidence in the fusion call, since the caller has + overridden the gate). 
+ - otherwise estimate difficulty (injected estimator if an embedding is + present, else the lexical fallback) and place it against the two + thresholds: + * difficulty >= ``threshold`` => ``fusion`` (Quality panel) + * ``budget_threshold`` <= difficulty => ``budget_fusion`` (cheap) + * difficulty < ``budget_threshold`` => ``single`` (cheapest model) + - confidence is derived from the margin to ``threshold`` (see + ``_confidence``). When the middle tier is disabled + (``budget_threshold == threshold``) this collapses to single/fusion. + """ + if query_input.get("high_stakes"): + # Caller override: fuse, and report difficulty if we can still compute + # it for logging, but the decision itself is forced with full confidence. + difficulty = self._difficulty(query_input) + return GateDecision( + tier="fusion", + difficulty=difficulty, + confidence=1.0, + ) + + difficulty = self._difficulty(query_input) + confidence = self._confidence(difficulty) + + if difficulty >= self.threshold: + return GateDecision( + tier="fusion", + difficulty=difficulty, + confidence=confidence, + ) + + if difficulty >= self.budget_threshold: + return GateDecision( + tier="budget_fusion", + difficulty=difficulty, + confidence=confidence, + ) + + return GateDecision( + tier="single", + model_name=self._cheapest_model(), + difficulty=difficulty, + confidence=confidence, + ) + + # ----------------------------------------------------------- difficulty + + def _difficulty(self, query_input: dict) -> float: + """Estimate difficulty in [0, 1] for a query. + + Prefers the injected estimator when both an embedding and an estimator + are available; otherwise falls back to the deterministic lexical + heuristic over the raw query text. 
+ """ + embedding = query_input.get("embedding") + if embedding is not None and self.estimator is not None: + return self._estimate_with_model(embedding) + return self._lexical_difficulty(query_input.get("query", "")) + + def _estimate_with_model(self, embedding: Any) -> float: + """Run the injected estimator and coerce its output to a clamped float. + + The estimator is duck-typed: any callable returning either a Python float + or an object exposing ``.item()`` (e.g. a 0-d / 1-element tensor). This + mirrors ``ThresholdRouter._estimate_difficulty`` without importing torch. + """ + score = self.estimator(embedding) + if hasattr(score, "item"): + score = score.item() + return self._clamp(float(score)) + + def _lexical_difficulty(self, query: str) -> float: + """Deterministic lexical difficulty heuristic (no model required). + + Blends three normalized signals: + - length : ``len(query) / 400`` clamped to 1.0. + - code/math : 1.0 if any code/math keyword or symbol is present, + else 0.0. + - multi-part : count of sub-questions (``?`` plus enumerated/"and"-joined + clauses) normalized by ``_MULTIPART_SATURATION``. + + The blend is a fixed-weight convex combination, so the result is always + in [0, 1] and fully reproducible. Kept pure (text in, float out) for + unit testing. 
+ """ + if not query: + return 0.0 + + length_signal = self._clamp(len(query) / _LENGTH_SATURATION_CHARS) + code_math_signal = 1.0 if self._has_code_or_math(query) else 0.0 + multipart_signal = self._clamp(self._count_subquestions(query) / _MULTIPART_SATURATION) + + difficulty = ( + _LENGTH_WEIGHT * length_signal + + _CODE_MATH_WEIGHT * code_math_signal + + _MULTIPART_WEIGHT * multipart_signal + ) + return self._clamp(difficulty) + + @staticmethod + def _has_code_or_math(query: str) -> bool: + """True if the query contains a code/math keyword or symbol.""" + lowered = query.lower() + if any(keyword in lowered for keyword in _CODE_MATH_KEYWORDS): + return True + return any(symbol in query for symbol in _CODE_MATH_SYMBOLS) + + @staticmethod + def _count_subquestions(query: str) -> int: + """Count distinct sub-questions / parts in a query. + + Heuristic: the larger of (a) the number of '?' characters and + (b) 1 + the number of enumerated parts or coordinating " and "/" ; " + separators. A single simple question therefore counts as 1. + """ + question_marks = query.count("?") + # Enumerated parts: "1.", "2)", "- ", or coordinating separators. + enumerations = len(re.findall(r"(?:\b\d+[.)]\s)|(?:\s;\s)|(?:\sand\s)", query)) + parts = max(question_marks, 1 + enumerations) + return parts + + # ----------------------------------------------------------- confidence + + def _confidence(self, difficulty: float) -> float: + """Derive confidence in [0, 1] from the margin to the threshold. + + Intuition: the gate is most confident when difficulty sits far from the + threshold (clearly easy or clearly hard) and least confident right at the + boundary. We normalize the absolute margin by the larger side of the + threshold split so both an easy and a hard query can reach full + confidence at the extremes. 
+ """ + margin = abs(difficulty - self.threshold) + span = max(self.threshold, 1.0 - self.threshold) + if span <= 0.0: + return 1.0 + return self._clamp(margin / span) + + # ----------------------------------------------------------- selection + + def cheapest_model(self) -> str: + """Pick the lowest-cost candidate as the single-path default (public API). + + Public entry point for callers outside this class (the router's downgrade + guard, the eval harness). Delegates to the private implementation. + """ + return self._cheapest_model() + + def _cheapest_model(self) -> str: + """Pick the lowest-cost candidate as the single-path default.""" + + def cost(name: str) -> float: + info = self.llm_data.get(name, {}) + # default_llm.json uses input_price / output_price; fall back gracefully + return float(info.get("input_price", 0.0)) + float(info.get("output_price", 0.0)) + + return min(self.llm_names, key=cost) if self.llm_names else "" + + # ----------------------------------------------------------- utilities + + @staticmethod + def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float: + """Clamp ``value`` into the closed interval [low, high].""" + return max(low, min(high, value)) diff --git a/custom_routers/fusion_gate/router.py b/custom_routers/fusion_gate/router.py new file mode 100644 index 0000000..75f5fe6 --- /dev/null +++ b/custom_routers/fusion_gate/router.py @@ -0,0 +1,270 @@ +"""FusionGateRouter — route-vs-fuse meta-router for LLMRouter. + +Integration entry point (UMB-118/121/123/124). Implements the MetaRouter +contract (``route_single`` / ``route_batch``) and owns: + + - UMB-121: reading and respecting all six config keys (``threshold``, ``k``, + ``judge``, ``provider``/``base_url``, ``panel_preset``, ``cost_ceiling``), + plus a spend-free ``--route-only`` path and the cost guard. + - UMB-123: capability-scored panel selection via ``CapabilityScorer``, with a + preset (Quality / Budget) fallback when capability data is unavailable. 
+ - UMB-124: a three-tier dial — ``single`` -> ``budget_fusion`` (cheap panel) + -> ``fusion`` (full Quality panel) — threaded from the gate through to panel + selection. + +The router's only job is to DECIDE, per query, between: + - the cheap SINGLE-model path (classic LLMRouter routing), + - a cheap BUDGET-fusion panel, or + - the full QUALITY-fusion panel (OpenRouter ``openrouter:fusion`` server tool). + +Routing never spends: ``route_single`` returns a decision dict (no API call). +Spend happens only in ``fuse()``, which is invoked separately and logs every +call via ``fusion_log.log_fusion``. + +See: fusion-gate-router-prd-v0.2.0.md +""" + +from __future__ import annotations + +import sys +from typing import Any + +# torch is a GENUINE transitive requirement of this module: MetaRouter subclasses +# torch.nn.Module, so importing FusionGateRouter requires torch even though the +# gate itself runs no inference. This import is intentionally eager (not lazy) so +# the dependency fails loudly at import time rather than mid-route. Torch-free +# unit tests therefore load gate.py / executor.py / fusion_log.py by file path, +# never through this module — see custom_routers/fusion_gate/tests/. +import torch.nn as nn + +from llmrouter.models.meta_router import MetaRouter + +from .capability import CapabilityScorer +from .executor import CostCeilingExceeded, FusionExecutor, FusionResult +from .fusion_log import log_fusion +from .gate import FUSION_TIERS, GateDecision, RouteGate, resolve_preset + + +class FusionGateRouter(MetaRouter): + """Meta-router that gates each query between single routing and fusion tiers. 
+ + Decision contract returned by ``route_single``: + + single path → {"query", "strategy": "single", "tier": "single", + "model_name", "predicted_llm", "difficulty", "confidence"} + fusion path → {"query", "strategy": "fusion", "tier": "budget_fusion"|"fusion", + "panel": [...], "judge": ..., "model_name", "predicted_llm", + "difficulty", "confidence", "projected_cost"} + + Both shapes carry ``strategy`` and ``tier`` so downstream code can branch, and + both carry ``model_name`` for drop-in compatibility with the CLI's + ``route_query`` (which keys on ``model_name`` / ``predicted_llm``). The fusion + path's ``model_name`` is the judge (or panel head) purely as a label — no API + call is made during routing. + """ + + def __init__(self, yaml_path: str): + # MetaRouter manages config/LLM-candidate loading. A simple gate needs no + # trainable model, so Identity stands in until a learned gate lands. + model = nn.Identity() + super().__init__(model=model, yaml_path=yaml_path) + + # Available candidate LLMs (name -> metadata dict from default_llm.json). + self.llm_names: list[str] = list(self.llm_data.keys()) + + # ------------------------------------------------------ config (UMB-121) + # All six config keys are read here and respected downstream. `.get` + # keeps construction robust if a key is omitted. + hparam: dict[str, Any] = self.cfg.get("hparam", {}) or {} + self.threshold: float = float(hparam.get("threshold", 0.5)) + self.k: int = int(hparam.get("k", 3)) + self.judge: str | None = hparam.get("judge") + self.panel_preset: str = hparam.get("panel_preset", "Quality") + cost_ceiling = hparam.get("cost_ceiling") + self.cost_ceiling: float | None = ( + float(cost_ceiling) if cost_ceiling is not None else None + ) + # Per-completion output-token estimate feeding the dollar cost projection. + self.est_completion_tokens: int = int(hparam.get("est_completion_tokens", 512)) + # Three-tier dial (UMB-124): optional lower boundary for the middle tier. 
+ budget_threshold = hparam.get("budget_threshold") + self.budget_threshold: float | None = ( + float(budget_threshold) if budget_threshold is not None else None + ) + + # Provider / base_url (UMB-121). ``base_url`` is the OpenRouter endpoint + # for the beta server tool; ``provider`` is informational and resolved by + # the executor for key lookup. ``base_url`` takes precedence over the + # legacy top-level ``api_endpoint``. + self.provider: str | None = hparam.get("provider") or self.cfg.get("provider") + self.base_url: str | None = ( + hparam.get("base_url") + or self.cfg.get("base_url") + or self.cfg.get("api_endpoint") + ) + + # Optional JSONL sink for fusion logging (defaults inside fusion_log). + self.log_sink_path: str | None = hparam.get("log_sink_path") + + # ----------------------------------------------------------- seams + self.gate = RouteGate( + llm_data=self.llm_data, + threshold=self.threshold, + budget_threshold=self.budget_threshold, + ) + # Capability scorer (UMB-123) sources per-model performance from the + # routing data the DataLoader attached (DataFrame or None). Prefer the + # train split (richer); fall back to test split. + routing_data = getattr(self, "routing_data_train", None) + if routing_data is None: + routing_data = getattr(self, "routing_data_test", None) + self.capability = CapabilityScorer( + llm_data=self.llm_data, + routing_data=routing_data, + ) + self.executor = FusionExecutor( + llm_data=self.llm_data, + judge=self.judge, + panel_preset=self.panel_preset, + cost_ceiling=self.cost_ceiling, + api_endpoint=self.base_url, + est_completion_tokens=self.est_completion_tokens, + ) + + # ------------------------------------------------------------------ routing + + def route_single(self, query_input: dict) -> dict: + """Route one query: decide tier, then select the panel if fusing. + + SPEND-FREE: this only computes a decision. No OpenRouter call is made + here, so ``--route-only`` (and the normal CLI route step) never spend. 
+ For the fusion tiers the intended panel/judge and a projected cost are + included so callers can audit the plan before invoking ``fuse()``. + """ + query = query_input["query"] + + decision: GateDecision = self.gate.decide(query_input) + + if decision.tier not in FUSION_TIERS: + return { + "query": query, + "strategy": "single", + "tier": decision.tier, + "model_name": decision.model_name, + "predicted_llm": decision.model_name, + "difficulty": decision.difficulty, + "confidence": decision.confidence, + } + + # Fusion tier: select the panel (UMB-123/124). The judge is the + # config-driven slug (None => the executor uses the outer model). + panel = self._select_panel(query_input, decision) + judge = self.judge + + # Cost guard (UMB-121): when the projected Σ(panel)+judge exceeds the + # ceiling, abort fusion by DOWNGRADING to the cheap single path rather + # than spending. The downgrade is reported via ``tier``/``downgraded``. + projected = self.executor.project_cost(panel, judge, query=query) + if self.cost_ceiling is not None and projected > self.cost_ceiling: + fallback_model = self.gate.cheapest_model() + return { + "query": query, + "strategy": "single", + "tier": "single", + "downgraded_from": decision.tier, + "model_name": fallback_model, + "predicted_llm": fallback_model, + "difficulty": decision.difficulty, + "confidence": decision.confidence, + "projected_cost": projected, + "cost_ceiling": self.cost_ceiling, + } + + return { + "query": query, + "strategy": "fusion", + "tier": decision.tier, + "panel": panel, + "judge": judge, + # Label only (the judge/outer model); no API call is made in routing. 
+ "model_name": judge or (panel[0] if panel else None), + "predicted_llm": judge or (panel[0] if panel else None), + "difficulty": decision.difficulty, + "confidence": decision.confidence, + "projected_cost": projected, + } + + def route_batch(self, batch: list) -> list: + """Route multiple queries.""" + return [self.route_single(q) for q in batch] + + # ----------------------------------------------------------------- internals + + def _select_panel(self, query_input: dict, decision: GateDecision) -> list[str]: + """Pick the fusion panel (maps to the tool's ``analysis_models``). + + UMB-123/124: capability-scored, tier-aware selection. + - The capability scorer (UMB-123) scores candidates for the query's + category and returns the top-k; panel membership therefore varies by + query type (code/math/reasoning vs general). + - The tier (UMB-124) selects the fallback preset when capability data + is unavailable: ``budget_fusion`` -> Budget, ``fusion`` -> Quality. + - A panel pre-selected on the gate decision wins, if present. + """ + if decision.panel: + return decision.panel + + query = query_input.get("query", "") + panel = self.capability.select_panel(query, self.k) + if panel: + return panel + + # Fallback: capability data unavailable for this query -> preset panel. + # The tier dictates the preset: the mid ``budget_fusion`` tier maps to the + # cheap Budget panel; any other tier (the top ``fusion`` tier) resolves to + # the router's configured ``panel_preset``. Resolution is delegated to + # ``gate.resolve_preset`` — the single source of truth shared with the + # eval harness so the two cannot silently diverge. + preset = resolve_preset(decision.tier, self.panel_preset) + return self.capability.preset_panel(preset, self.k) + + # --------------------------------------------------------------- execution + + def fuse(self, route_result: dict, **gen_kwargs: Any) -> FusionResult: + """Execute a fusion decision via the FusionExecutor (UMB-120). 
+ + Kept separate from ``route_single`` so ``--route-only`` (UMB-121) can + return the decision without ever calling this. Every fusion call is + logged via ``fusion_log.log_fusion`` (UMB-125) — secret-scrubbed, + raw-payload-free. + + Raises: + ValueError: if called on a non-fusion route result. + CostCeilingExceeded: re-raised from the executor's pre-call guard. + """ + if route_result.get("strategy") != "fusion": + raise ValueError("fuse() called on a non-fusion route result") + + query = route_result["query"] + result = self.executor.run( + query=query, + panel=route_result["panel"], + judge=route_result.get("judge"), + **gen_kwargs, + ) + + # Log every fusion call (audit + training signal). A logging failure + # (disk-full, permission-denied, NFS timeout) must NEVER destroy the + # already-computed result, so the append is best-effort: any exception is + # swallowed (reported to stderr) and the result is returned regardless. + # The sink is append-only and secret-scrubbed inside fusion_log. + try: + log_fusion(result, query=query, sink_path=self.log_sink_path, cost=result.cost) + except Exception as exc: # noqa: BLE001 - logging must not lose the result + # Report the failure type only; never echo the path/secret-bearing detail. + print( + f"fusion_log: failed to persist fusion call ({type(exc).__name__}); " + "continuing without losing the result.", + file=sys.stderr, + ) + return result diff --git a/custom_routers/fusion_gate/tests/__init__.py b/custom_routers/fusion_gate/tests/__init__.py new file mode 100644 index 0000000..078ea56 --- /dev/null +++ b/custom_routers/fusion_gate/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the fusion_gate plugin.""" diff --git a/custom_routers/fusion_gate/tests/conftest.py b/custom_routers/fusion_gate/tests/conftest.py new file mode 100644 index 0000000..1d90e5d --- /dev/null +++ b/custom_routers/fusion_gate/tests/conftest.py @@ -0,0 +1,37 @@ +"""pytest bootstrap for the fusion_gate test suite. 
+ +Makes ``pytest custom_routers/fusion_gate/tests/`` work out of the box, without +relying on the standalone ``python test_gate.py`` runner as a workaround. + +Two things are guaranteed here: + + 1. The repo root is on ``sys.path`` so ``custom_routers`` resolves as a package + (the torch-dependent ``test_router`` imports ``custom_routers.fusion_gate.router`` + directly). + 2. ``--import-mode=importlib`` is enabled so pytest does not rewrite ``sys.path`` + in ways that re-trigger package ``__init__`` collection. Combined with the + lazy ``__getattr__`` in ``fusion_gate/__init__.py``, the four torch-free test + modules (gate / executor / capability / fusion_log / eval_harness) collect and + run with no torch installed. +""" + +from __future__ import annotations + +import os +import sys + +# Repo root = three levels up from this file (tests/ -> fusion_gate/ -> +# custom_routers/ -> repo root). +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + + +def pytest_configure(config) -> None: + """Force importlib import mode so package collection stays torch-free. + + Setting it here (rather than only in pytest.ini) keeps the behavior local to + this plugin's tests and avoids editing any repo-level config outside + custom_routers/fusion_gate/. + """ + config.option.importmode = "importlib" diff --git a/custom_routers/fusion_gate/tests/test_capability.py b/custom_routers/fusion_gate/tests/test_capability.py new file mode 100644 index 0000000..fd1ec56 --- /dev/null +++ b/custom_routers/fusion_gate/tests/test_capability.py @@ -0,0 +1,197 @@ +"""Offline unit tests for ``CapabilityScorer`` (UMB-123). + +Fully offline: no network, no torch, no trained model, and no large data files. +``capability.py`` is loaded directly by file path (like ``test_gate.py``) so the +package ``__init__`` — which pulls in ``router.py``/torch — is never imported. 
+ +Coverage: + - panel membership VARIES by query type (code/math/reasoning vs general) when + backed by per-category routing performance + - top-k respected; k clamped against the candidate set + - preset fallback (Quality vs Budget) resolves by price + - ``select_panel`` returns None (-> preset fallback) when no capability data + and llm_data carries no usable prior + - task_name -> category bucketing +""" + +from __future__ import annotations + +import importlib.util +import os +import sys +from typing import Any + +_CAP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "capability.py")) +_spec = importlib.util.spec_from_file_location("fusion_gate_capability", _CAP_PATH) +assert _spec is not None and _spec.loader is not None +_cap_mod = importlib.util.module_from_spec(_spec) +sys.modules[_spec.name] = _cap_mod +_spec.loader.exec_module(_cap_mod) + +CapabilityScorer = _cap_mod.CapabilityScorer + +# Candidate set mirroring default_llm.json shape (size / feature / prices). +LLM_DATA: dict[str, dict[str, Any]] = { + "cheap-7b": { + "size": "7B", + "feature": "fast and efficient small model", + "input_price": 0.20, + "output_price": 0.20, + }, + "mid-49b": { + "size": "49B", + "feature": "powerful high-accuracy model for complex tasks", + "input_price": 0.90, + "output_price": 0.90, + }, + "big-141b": { + "size": "141B", + "feature": "advanced large-scale model with exceptional performance", + "input_price": 1.20, + "output_price": 1.20, + }, + "moe-45b": { + "size": "45B", + "feature": "mixture of experts optimized for creative generation", + "input_price": 0.60, + "output_price": 0.60, + }, +} + +# Routing rows that make different models best at different categories so the +# panel is forced to vary by query type. cheap-7b dominates "code", big-141b +# dominates "reasoning"/"math". 
+ROUTING_ROWS = [ + {"task_name": "humaneval-code", "model_name": "cheap-7b", "performance": 0.95}, + {"task_name": "humaneval-code", "model_name": "mid-49b", "performance": 0.30}, + {"task_name": "humaneval-code", "model_name": "big-141b", "performance": 0.20}, + {"task_name": "humaneval-code", "model_name": "moe-45b", "performance": 0.40}, + {"task_name": "agentverse-logicgrid", "model_name": "cheap-7b", "performance": 0.10}, + {"task_name": "agentverse-logicgrid", "model_name": "mid-49b", "performance": 0.50}, + {"task_name": "agentverse-logicgrid", "model_name": "big-141b", "performance": 0.98}, + {"task_name": "agentverse-logicgrid", "model_name": "moe-45b", "performance": 0.40}, +] + + +def _scorer(routing_data: Any = None) -> CapabilityScorer: + return CapabilityScorer(llm_data=LLM_DATA, routing_data=routing_data or ROUTING_ROWS) + + +# ----------------------------------------------------------- query classification + + +def test_classify_query_categories(): + s = _scorer() + assert s.classify_query("Write a python function to debug this code") == "code" + assert s.classify_query("Compute the integral and prove the theorem") == "math" + assert s.classify_query("Solve this logic puzzle step by step") == "reasoning" + assert s.classify_query("What is the capital of France?") == "general" + assert s.classify_query("") == "general" + + +# ------------------------------------------------------------- panel variation + + +def test_panel_varies_by_query_type(): + """A code query and a reasoning query must yield different panels.""" + s = _scorer() + code_panel = s.select_panel("Write a function to fix this bug in my code", k=2) + reasoning_panel = s.select_panel("Solve this logic puzzle, reason step by step", k=2) + + assert code_panel is not None and reasoning_panel is not None + # cheap-7b is best at code; big-141b is best at reasoning. 
+ assert code_panel[0] == "cheap-7b" + assert reasoning_panel[0] == "big-141b" + assert code_panel != reasoning_panel + + +def test_top_k_respected_and_clamped(): + s = _scorer() + assert len(s.select_panel("debug this code", k=2)) == 2 + # k larger than candidate count returns all candidates, not an error. + full = s.select_panel("debug this code", k=99) + assert len(full) == len(LLM_DATA) + # k <= 0 -> None (preset fallback trigger). + assert s.select_panel("debug this code", k=0) is None + + +# ------------------------------------------------------------------ fallback + + +def test_select_panel_returns_none_without_any_capability_signal(): + """No routing data AND no llm_data prior => None (preset fallback).""" + s = CapabilityScorer(llm_data={}, routing_data=None) + assert s.select_panel("anything", k=3) is None + + +def test_preset_panel_quality_vs_budget_by_price(): + s = _scorer() + quality = s.preset_panel("Quality", k=2) + budget = s.preset_panel("Budget", k=2) + + # Quality favors most-capable (highest price proxy) first. + assert quality[0] == "big-141b" + # Budget favors cheapest first. + assert budget[0] == "cheap-7b" + assert quality != budget + + +def test_static_prior_used_when_routing_data_absent(): + """Without routing data, scoring still differentiates via the llm_data prior.""" + s = CapabilityScorer(llm_data=LLM_DATA, routing_data=None) + panel = s.select_panel("general knowledge question", k=2) + assert panel is not None + # Largest/most-capable model ranks first via the size/feature prior. 
+ assert panel[0] == "big-141b" + + +# ---------------------------------------------------------- task bucketing + + +def test_task_name_to_category_bucketing(): + s = _scorer() + assert s._task_to_category("humaneval-code") == "code" + assert s._task_to_category("agentverse-logicgrid") == "reasoning" + assert s._task_to_category("gsm8k") == "math" + assert s._task_to_category("trivia-qa") == "general" + assert s._task_to_category(None) == "general" + + +def test_dataframe_like_routing_data_is_accepted(): + """A pandas-like object exposing to_dict(orient='records') is consumed.""" + + class _FakeDF: + def __init__(self, rows): + self._rows = rows + + def to_dict(self, orient="records"): # noqa: D401 - mirror pandas API + assert orient == "records" + return self._rows + + s = CapabilityScorer(llm_data=LLM_DATA, routing_data=_FakeDF(ROUTING_ROWS)) + code_panel = s.select_panel("debug this code", k=1) + assert code_panel == ["cheap-7b"] + + +# ----------------------------------------------------------------- runner + + +def _run_all() -> int: + tests = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)] + failures = 0 + for test in tests: + try: + test() + print(f"PASS {test.__name__}") + except AssertionError as exc: # pragma: no cover - reporting path + failures += 1 + print(f"FAIL {test.__name__}: {exc}") + except Exception as exc: # pragma: no cover - reporting path + failures += 1 + print(f"ERROR {test.__name__}: {type(exc).__name__}: {exc}") + print(f"\n{len(tests) - failures}/{len(tests)} passed") + return 1 if failures else 0 + + +if __name__ == "__main__": + raise SystemExit(_run_all()) diff --git a/custom_routers/fusion_gate/tests/test_eval_harness.py b/custom_routers/fusion_gate/tests/test_eval_harness.py new file mode 100644 index 0000000..15015dd --- /dev/null +++ b/custom_routers/fusion_gate/tests/test_eval_harness.py @@ -0,0 +1,227 @@ +"""Fast offline tests for the eval + retrain harness (UMB-122/124/126). 
+ +These run the MOCK harness end-to-end against the bundled fixtures — zero spend, +no network. The harness modules are imported by file path (like test_executor.py) +so importing them never triggers the package __init__ (which pulls in torch via +router.py); the harness itself is torch-free. + +Run: ``pytest custom_routers/fusion_gate/tests/test_eval_harness.py`` or, with no +pytest installed, ``python custom_routers/fusion_gate/tests/test_eval_harness.py``. +""" + +from __future__ import annotations + +import importlib.util +import os +import sys +import types +from pathlib import Path + +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +_EVAL_DIR = Path(_REPO_ROOT) / "custom_routers" / "fusion_gate" / "eval" +_FIXTURES = _EVAL_DIR / "fixtures" + + +def _ensure_namespace_packages() -> None: + """Register lightweight package stubs so the eval modules' relative imports + (``from .eval_harness import ...``) resolve WITHOUT executing + ``custom_routers/fusion_gate/__init__.py`` — which imports torch via + router.py. 
The harness is deliberately torch-free, so we route around it.""" + for pkg_name, pkg_dir in ( + ("custom_routers", Path(_REPO_ROOT) / "custom_routers"), + ("custom_routers.fusion_gate", _EVAL_DIR.parent), + ("custom_routers.fusion_gate.eval", _EVAL_DIR), + ): + if pkg_name not in sys.modules: + mod = types.ModuleType(pkg_name) + mod.__path__ = [str(pkg_dir)] # mark as a package + sys.modules[pkg_name] = mod + + +def _import_eval_module(name: str, filename: str): + """Import an eval module by path under its package alias.""" + full = f"custom_routers.fusion_gate.eval.{name}" + if full in sys.modules: + return sys.modules[full] + spec = importlib.util.spec_from_file_location(full, str(_EVAL_DIR / filename)) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[full] = module + spec.loader.exec_module(module) + return module + + +_ensure_namespace_packages() +eval_harness = _import_eval_module("eval_harness", "eval_harness.py") +retrain = _import_eval_module("retrain", "retrain.py") + +DATASET = str(_FIXTURES / "hard_slice.jsonl") +LLM = str(_FIXTURES / "llm_candidates.json") +ROUTING = str(_FIXTURES / "routing_data.jsonl") + + +def _build_harness(**overrides): + kwargs = dict( + dataset_path=DATASET, llm_path=LLM, routing_path=ROUTING, + threshold=0.5, budget_threshold=0.3, k=2, judge=None, panel_preset="Quality", + ) + kwargs.update(overrides) + return eval_harness.build_mock_harness(**kwargs) + + +# --------------------------------------------------------- harness end-to-end + + +def test_harness_runs_all_three_arms_offline(): + harness, dataset, _ = _build_harness() + arms = harness.run_all() + + assert set(arms) == {"always_route", "always_fuse", "fusion_gate"} + for name, r in arms.items(): + assert r.n == len(dataset) + assert 0.0 <= r.quality <= 1.0 + assert r.blended_cost >= 0.0 + assert 0.0 <= r.escalation_p <= 1.0 + + # always_route never escalates; always_fuse always escalates. 
+ assert arms["always_route"].escalation_p == 0.0 + assert arms["always_fuse"].escalation_p == 1.0 + + +def test_blended_cost_ordering(): + """Fusion costs more per query than a single model; gate sits in between or below fuse.""" + harness, _, _ = _build_harness() + arms = harness.run_all() + assert arms["always_route"].blended_cost < arms["always_fuse"].blended_cost + assert arms["fusion_gate"].blended_cost <= arms["always_fuse"].blended_cost + + +def test_m1_m2_m3_verdicts_present_and_pass_on_fixtures(): + harness, dataset, dataset_path = _build_harness() + arms = harness.run_all() + verdicts = eval_harness.compute_verdicts(arms) + + # M1: gate quality >= 95% of always-fuse quality. + assert verdicts["M1"]["pass"], verdicts["M1"] + # M2: blended cost <= 1.6x always-route. + assert verdicts["M2"]["pass"], verdicts["M2"] + # M3: gate-precision is measured (escalated queries that beat best single). + assert verdicts["M3"]["gate_precision"] is not None + assert verdicts["M3"]["n_escalated"] >= 1 + + +def test_gate_precision_counts_only_improvements(): + """M3 counts an escalation only when fusion is right AND best single is wrong.""" + harness, dataset, _ = _build_harness() + arm = harness.run_fusion_gate() + # On the fixtures, fusion is designed to be correct where the majority single + # answer is wrong on at least some escalated queries. + assert arm.n_escalated_improved >= 1 + assert arm.n_escalated_improved <= arm.n_escalated + + +def test_report_and_csv_are_written(tmp_path): + rc = eval_harness.main([ + "--mock", "--dataset", DATASET, "--llm", LLM, "--routing", ROUTING, + "--out", str(tmp_path), "--with-retrain", + ]) + assert rc == 0 + csv_path = tmp_path / "results.csv" + md_path = tmp_path / "results.md" + assert csv_path.exists() + assert md_path.exists() + + md = md_path.read_text(encoding="utf-8") + # The report must explicitly flag mock provenance and the keyed live path. 
+ assert "MOCK fixtures" in md + assert "keyed live run" in md.lower() + assert "M1" in md and "M2" in md and "M3" in md + # --with-retrain appends the retrain delta block. + assert "Retrain" in md and "before vs after" in md + + # CSV has a header + three arm rows. + rows = csv_path.read_text(encoding="utf-8").strip().splitlines() + assert len(rows) == 4 + assert rows[0].startswith("arm,") + + +def test_live_mode_is_blocked(): + """The offline harness refuses --live so a stray run cannot spend.""" + import pytest as _pytest # only used when pytest is present + + with _pytest.raises(SystemExit): + eval_harness.main(["--live", "--dataset", DATASET, "--llm", LLM]) + + +# --------------------------------------------------------- retrain loop + + +def test_retrain_measures_m3_before_and_after_offline(): + dataset = eval_harness.load_jsonl(DATASET) + llm_data = eval_harness.load_llm_candidates(LLM) + base_routing = eval_harness.load_jsonl(ROUTING) + + log = retrain.synthesize_fusion_log(dataset, llm_data, base_routing, k=3, judge=None) + assert log, "expected a non-empty synthesized fusion log" + # Log entries are in fusion_log shape. + assert all("responses" in e and "query" in e for e in log) + + result = retrain.run_retrain(dataset, llm_data, base_routing, log=log, k=3, judge=None) + + # Routing table is strictly augmented by the replayed responses. + assert result["n_augmented_routing_rows"] > result["n_base_routing_rows"] + # Both M3 measurements are produced and the delta is reported. + assert result["m3_before"] is not None + assert result["m3_after"] is not None + assert result["m3_delta"] == result["m3_after"] - result["m3_before"] + # Threshold is refit within bounds. 
+ assert 0.1 <= result["threshold_after"] <= 0.9 + + +def test_retrain_block_renders_with_mock_flag(): + dataset = eval_harness.load_jsonl(DATASET) + block = retrain.mock_retrain_report_block( + dataset=dataset, llm_path=LLM, routing_path=ROUTING, k=3, judge=None + ) + assert "Retrain" in block + assert "MOCK fixtures" in block + assert "M3 gate-precision" in block + + +# --------------------------------------------------------- manual runner + + +def _run_all_manually() -> int: + """Run every test_* with no pytest (env without pytest installed).""" + import tempfile + + failures = 0 + for fn_name, fn in sorted(globals().items()): + if not fn_name.startswith("test_") or not callable(fn): + continue + try: + if fn_name == "test_report_and_csv_are_written": + with tempfile.TemporaryDirectory() as d: + fn(Path(d)) + elif fn_name == "test_live_mode_is_blocked": + # Reproduce the SystemExit assertion without pytest. + try: + eval_harness.main(["--live", "--dataset", DATASET, "--llm", LLM]) + except SystemExit: + pass + else: + raise AssertionError("--live should SystemExit") + else: + fn() + print(f"PASS {fn_name}") + except Exception as exc: # noqa: BLE001 + failures += 1 + print(f"FAIL {fn_name}: {type(exc).__name__}: {exc}") + return failures + + +if __name__ == "__main__": + raise SystemExit(1 if _run_all_manually() else 0) diff --git a/custom_routers/fusion_gate/tests/test_executor.py b/custom_routers/fusion_gate/tests/test_executor.py new file mode 100644 index 0000000..f74d17b --- /dev/null +++ b/custom_routers/fusion_gate/tests/test_executor.py @@ -0,0 +1,228 @@ +"""Offline unit tests for FusionExecutor.run (UMB-120). + +All tests mock the HTTP layer — no live network/API calls are made. The HTTP +seam is patched at ``requests.post`` (the executor prefers ``requests`` when it +imports successfully), so the request body can be inspected and the response +faked. 
+""" + +from __future__ import annotations + +import importlib.util +import json +import os +import sys +from typing import Any + +import pytest + +# Import the executor module directly by file path so these tests stay offline +# and free of the package __init__ (which imports torch via router.py). The +# executor itself has no torch dependency. The module is registered in +# sys.modules before execution so its dataclasses resolve field types. +_EXECUTOR_PATH = os.path.join(os.path.dirname(__file__), "..", "executor.py") +_spec = importlib.util.spec_from_file_location("fusion_gate_executor", _EXECUTOR_PATH) +assert _spec is not None and _spec.loader is not None +_executor = importlib.util.module_from_spec(_spec) +sys.modules[_spec.name] = _executor +_spec.loader.exec_module(_executor) + +CostCeilingExceeded = _executor.CostCeilingExceeded +FusionExecutor = _executor.FusionExecutor +FusionResult = _executor.FusionResult + +PANEL = ["model-a", "model-b", "model-c"] +JUDGE = "judge-model" + +# Per-model unit prices mirroring default_llm.json's input_price/output_price. 
+LLM_DATA: dict[str, dict[str, Any]] = { + "model-a": {"input_price": 0.20, "output_price": 0.20}, + "model-b": {"input_price": 0.60, "output_price": 0.60}, + "model-c": {"input_price": 0.90, "output_price": 0.90}, + "judge-model": {"input_price": 1.20, "output_price": 1.20}, +} + +API_KEYS = {"OpenRouter": "sk-test-key"} + + +class _FakeResponse: + """Minimal stand-in for a requests.Response.""" + + def __init__(self, payload: dict[str, Any]): + self._payload = payload + + def raise_for_status(self) -> None: # noqa: D401 - mirror requests API + return None + + def json(self) -> dict[str, Any]: + return self._payload + + +def _make_executor(cost_ceiling: float | None = None) -> FusionExecutor: + return FusionExecutor( + llm_data=LLM_DATA, + judge=JUDGE, + cost_ceiling=cost_ceiling, + ) + + +def _patch_post(monkeypatch, payload: dict[str, Any], captured: dict[str, Any]): + """Patch requests.post to capture the body and return ``payload``.""" + + def fake_post(url, headers=None, json=None, timeout=None, **kwargs): # noqa: A002 + captured["url"] = url + captured["headers"] = headers + captured["body"] = json + captured["timeout"] = timeout + return _FakeResponse(payload) + + import requests + + monkeypatch.setattr(requests, "post", fake_post) + + +def test_happy_path_parses_responses_and_analysis(monkeypatch): + payload = { + "status": "ok", + "answer": "Fused answer.", + "analysis": { + "consensus": "All models agree X.", + "contradictions": ["b disagrees on Y"], + "blind_spots": ["none flagged Z"], + }, + "responses": [ + {"model": "model-a", "content": "answer from a"}, + {"model": "model-b", "content": "answer from b"}, + {"model": "model-c", "content": "answer from c"}, + ], + "cost": 1.23, + } + captured: dict[str, Any] = {} + _patch_post(monkeypatch, payload, captured) + + result = _make_executor().run("What is 2+2?", PANEL, api_keys=API_KEYS) + + assert isinstance(result, FusionResult) + assert result.answer == "Fused answer." 
+ assert result.analysis == { + "consensus": "All models agree X.", + "contradictions": ["b disagrees on Y"], + "blind_spots": ["none flagged Z"], + } + assert [r["model"] for r in result.responses] == PANEL + assert result.responses[0]["content"] == "answer from a" + assert result.panel == PANEL + assert result.judge == JUDGE + assert result.cost == 1.23 + assert result.raw == payload + + +def test_request_body_uses_required_tool_choice_and_panel(monkeypatch): + payload = { + "status": "ok", + "answer": "ok", + "analysis": {"consensus": "c", "contradictions": [], "blind_spots": []}, + "responses": [{"model": "model-a", "content": "x"}], + } + captured: dict[str, Any] = {} + _patch_post(monkeypatch, payload, captured) + + _make_executor().run("q", PANEL, api_keys=API_KEYS) + + body = captured["body"] + assert body["tool_choice"] == "required" + assert body["messages"] == [{"role": "user", "content": "q"}] + + tool = body["tools"][0] + assert tool["type"] == "openrouter:fusion" + assert tool["parameters"]["analysis_models"] == PANEL + assert tool["parameters"]["model"] == JUDGE + + # The Authorization header carries the key but the body never does. + assert captured["headers"]["Authorization"] == "Bearer sk-test-key" + assert "sk-test-key" not in json.dumps(body) + + +def test_judge_failure_falls_back_without_crashing(monkeypatch): + # status "ok" but analysis omitted -> synthesize from responses[]. 
+ payload = { + "status": "ok", + "responses": [ + {"model": "model-a", "content": "partial a"}, + {"model": "model-b", "content": "partial b"}, + ], + } + captured: dict[str, Any] = {} + _patch_post(monkeypatch, payload, captured) + + result = _make_executor().run("q", PANEL, api_keys=API_KEYS) + + assert result.analysis is None + assert result.answer == "partial a\n\npartial b" + assert [r["model"] for r in result.responses] == ["model-a", "model-b"] + + +def test_project_cost_is_per_query_dollars(): + """project_cost returns an estimated per-query DOLLAR cost, not a unit-price proxy. + + Prices in LLM_DATA are per-million-token. For each member, + (input_price*prompt_tokens + output_price*completion_tokens)/1e6, with + prompt_tokens estimated from the query (max(1, len(query)//4)) and + completion_tokens = est_completion_tokens (default 512). + """ + executor = _make_executor() + query = "x" * 400 # 400 chars -> ~100 prompt tokens + projected = executor.project_cost(PANEL, JUDGE, query=query) + + prompt_toks = max(1, len(query) // 4) # 100 + completion_toks = 512 + expected = 0.0 + for name in PANEL + [JUDGE]: + info = LLM_DATA[name] + expected += (info["input_price"] * prompt_toks + info["output_price"] * completion_toks) / 1e6 + + assert projected == pytest.approx(expected) + # Dollar-scale: a realistic per-query cost is well under a dollar here. + assert 0.0 < projected < 0.01 + + +def test_cost_ceiling_aborts_before_http_call(monkeypatch): + # Sentinel post that fails the test if the network layer is reached. + def boom(*args, **kwargs): + raise AssertionError("HTTP call must not happen when cost ceiling exceeded") + + import requests + + monkeypatch.setattr(requests, "post", boom) + + # A realistic per-query DOLLAR projection (~$0.0015 for this panel+judge) must + # trip a tight dollar ceiling. The ceiling is now interpreted as dollars/query. 
+ executor = _make_executor(cost_ceiling=0.0005) + + with pytest.raises(CostCeilingExceeded) as exc: + executor.run("q", PANEL, judge=JUDGE, api_keys=API_KEYS) + + assert exc.value.ceiling == 0.0005 + assert exc.value.projected > 0.0005 + # Sanity: the projection is dollar-scale, not the old unit-price-sum proxy (~5.8). + assert exc.value.projected < 0.01 + + +def test_realistic_cost_ceiling_allows_when_under_cap(monkeypatch): + """A realistic $0.05/query ceiling does NOT abort this cheap panel.""" + payload = { + "status": "ok", + "answer": "ok", + "analysis": {"consensus": "c", "contradictions": [], "blind_spots": []}, + "responses": [{"model": "model-a", "content": "x"}], + } + captured: dict[str, Any] = {} + _patch_post(monkeypatch, payload, captured) + + executor = _make_executor(cost_ceiling=0.05) + result = executor.run("q", PANEL, judge=JUDGE, api_keys=API_KEYS) + assert result.answer == "ok" + + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__, "-v"])) diff --git a/custom_routers/fusion_gate/tests/test_fusion_log.py b/custom_routers/fusion_gate/tests/test_fusion_log.py new file mode 100644 index 0000000..24d9359 --- /dev/null +++ b/custom_routers/fusion_gate/tests/test_fusion_log.py @@ -0,0 +1,209 @@ +"""Offline tests for the fusion log sink (UMB-125). + +Fully offline: no network, no torch, no real home directory. All writes go to a +per-test temp directory. The target modules are loaded by file path (not via the +package ``__init__``, which imports torch through router.py), so this suite runs +in a torch-free environment. 
Run with either: + + python -m pytest custom_routers/fusion_gate/tests/test_fusion_log.py + python custom_routers/fusion_gate/tests/test_fusion_log.py +""" + +from __future__ import annotations + +import importlib.util +import json +import os +import sys +import tempfile +import unittest +from pathlib import Path + +# Load the target modules directly by file path so these tests stay offline and +# free of the package __init__ (which imports torch via router.py). Both modules +# are torch-free. Each is registered in sys.modules before execution so the +# relative ``from .executor import FusionResult`` inside fusion_log.py — and the +# dataclass field types in executor.py — resolve. This mirrors test_executor.py. +_PLUGIN_DIR = Path(__file__).resolve().parents[1] + + +def _load_module(name: str, filename: str): + spec = importlib.util.spec_from_file_location(name, os.path.join(str(_PLUGIN_DIR), filename)) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +# executor must be registered under the package-relative name that fusion_log's +# ``from .executor import FusionResult`` resolves to, so the dataclass identities +# match. fusion_log.py is loaded as a package submodule (package=fusion_gate_pkg) +# whose .executor points at the executor we just loaded. 
+_pkg = type(sys)("fusion_gate_pkg") +_pkg.__path__ = [str(_PLUGIN_DIR)] +sys.modules["fusion_gate_pkg"] = _pkg + +_executor = _load_module("fusion_gate_pkg.executor", "executor.py") +_fusion_log = _load_module("fusion_gate_pkg.fusion_log", "fusion_log.py") + +FusionResult = _executor.FusionResult +DEFAULT_SINK_PATH = _fusion_log.DEFAULT_SINK_PATH +log_fusion = _fusion_log.log_fusion +to_training_rows = _fusion_log.to_training_rows + + +def _sample_result() -> FusionResult: + """A representative fusion result with a 3-model panel and a judge.""" + return FusionResult( + answer="Synthesized final answer.", + analysis={ + "consensus": "All three models agree on the core claim.", + "contradictions": [], + "blind_spots": ["edge case X"], + }, + responses=[ + {"model": "qwen2.5-7b-instruct", "content": "Answer from qwen."}, + {"model": "llama-3.1-8b-instruct", "content": "Answer from llama."}, + {"model": "mistral-7b-instruct-v0.3", "content": "Answer from mistral."}, + ], + panel=[ + "qwen2.5-7b-instruct", + "llama-3.1-8b-instruct", + "mistral-7b-instruct-v0.3", + ], + judge="llama3-70b-instruct", + cost=0.0042, + raw={"id": "gen-123", "authorization": "Bearer sk-secretkey-should-never-leak"}, + ) + + +class TestLogFusion(unittest.TestCase): + def setUp(self) -> None: + self._tmp = tempfile.TemporaryDirectory() + self.tmp_path = Path(self._tmp.name) + self.sink = self.tmp_path / "nested" / "fusion_log.jsonl" + self.query = "Explain the tradeoffs of consensus algorithms." 
+ + def tearDown(self) -> None: + self._tmp.cleanup() + + def test_entry_appended(self) -> None: + """One call writes exactly one JSONL line; a second call appends.""" + result = _sample_result() + + returned = log_fusion(result, self.query, sink_path=str(self.sink), token=512, cost=0.0042) + self.assertEqual(returned, self.sink) + self.assertTrue(self.sink.exists()) + + lines = self.sink.read_text(encoding="utf-8").splitlines() + self.assertEqual(len(lines), 1) + + log_fusion(result, self.query, sink_path=str(self.sink), token=256, cost=0.002) + lines = self.sink.read_text(encoding="utf-8").splitlines() + self.assertEqual(len(lines), 2) + + def test_entry_required_fields(self) -> None: + """Logged entry carries the full structured schema.""" + log_fusion(_sample_result(), self.query, sink_path=str(self.sink), token=512, cost=0.0042) + entry = json.loads(self.sink.read_text(encoding="utf-8").splitlines()[0]) + + for field in ("ts", "strategy", "query", "panel", "judge", "responses", "analysis", "token", "cost"): + self.assertIn(field, entry, f"missing field: {field}") + + self.assertEqual(entry["strategy"], "fusion") + self.assertEqual(entry["query"], self.query) + self.assertEqual(entry["judge"], "llama3-70b-instruct") + self.assertEqual(entry["token"], 512) + self.assertEqual(entry["cost"], 0.0042) + self.assertEqual(len(entry["responses"]), 3) + self.assertEqual(len(entry["panel"]), 3) + self.assertIn("consensus", entry["analysis"]) + + def test_cost_falls_back_to_result(self) -> None: + """When cost arg is None, the entry uses result.cost.""" + log_fusion(_sample_result(), self.query, sink_path=str(self.sink)) + entry = json.loads(self.sink.read_text(encoding="utf-8").splitlines()[0]) + self.assertEqual(entry["cost"], 0.0042) + self.assertIsNone(entry["token"]) + + def test_no_key_leakage(self) -> None: + """Raw provider payload and credential shapes never reach the file.""" + log_fusion(_sample_result(), self.query, sink_path=str(self.sink), token=1) + text = 
self.sink.read_text(encoding="utf-8") + + self.assertNotIn("sk-secretkey-should-never-leak", text) + self.assertNotIn("authorization", text.lower()) + self.assertNotIn("bearer", text.lower()) + self.assertNotIn("gen-123", text) # raw payload is dropped entirely + + entry = json.loads(text.splitlines()[0]) + self.assertNotIn("raw", entry) + + def test_inline_secret_in_response_redacted(self) -> None: + """Inline credential shapes inside response content are redacted.""" + result = FusionResult( + responses=[ + {"model": "qwen2.5-7b-instruct", "content": "Use key sk-abcdef0123456789 to call it."}, + ], + panel=["qwen2.5-7b-instruct"], + judge=None, + ) + log_fusion(result, self.query, sink_path=str(self.sink)) + text = self.sink.read_text(encoding="utf-8") + self.assertNotIn("sk-abcdef0123456789", text) + self.assertIn("[REDACTED]", text) + + def test_default_sink_path_uses_llmrouter_home(self) -> None: + """The documented default lands under ~/.llmrouter (not asserted to disk).""" + self.assertTrue(DEFAULT_SINK_PATH.endswith("openclaw_memory.jsonl")) + self.assertIn(".llmrouter", DEFAULT_SINK_PATH) + + +class TestToTrainingRows(unittest.TestCase): + def setUp(self) -> None: + self.query = "Explain the tradeoffs of consensus algorithms." 
+ + def test_responses_decompose_to_n_rows(self) -> None: + """N panel responses produce N per-model rows.""" + result = _sample_result() + rows = to_training_rows(result, self.query) + self.assertEqual(len(rows), len(result.responses)) + + models = [r["model_name"] for r in rows] + self.assertEqual( + models, + ["qwen2.5-7b-instruct", "llama-3.1-8b-instruct", "mistral-7b-instruct-v0.3"], + ) + + def test_row_required_schema_fields(self) -> None: + """Each row carries the FusionFactory-consumable schema.""" + rows = to_training_rows(_sample_result(), self.query) + for row in rows: + for field in ("query", "model_name", "model", "response", "performance"): + self.assertIn(field, row, f"missing field: {field}") + self.assertEqual(row["query"], self.query) + self.assertEqual(row["model_name"], row["model"]) + self.assertIsNone(row["performance"]) + + def test_rows_no_key_leakage(self) -> None: + """Inline secrets in response content are scrubbed in training rows.""" + result = FusionResult( + responses=[ + {"model": "qwen2.5-7b-instruct", "content": "token Bearer sk-leak0123456789abc here"}, + ], + panel=["qwen2.5-7b-instruct"], + ) + rows = to_training_rows(result, self.query) + blob = json.dumps(rows) + self.assertNotIn("sk-leak0123456789abc", blob) + self.assertIn("[REDACTED]", blob) + + def test_empty_responses_yield_no_rows(self) -> None: + """No panel responses -> no rows (fail-safe, not an error).""" + result = FusionResult(responses=[], panel=[]) + self.assertEqual(to_training_rows(result, self.query), []) + + +if __name__ == "__main__": + unittest.main() diff --git a/custom_routers/fusion_gate/tests/test_gate.py b/custom_routers/fusion_gate/tests/test_gate.py new file mode 100644 index 0000000..9e573fa --- /dev/null +++ b/custom_routers/fusion_gate/tests/test_gate.py @@ -0,0 +1,202 @@ +"""Offline unit tests for ``RouteGate`` (UMB-119). 
+ +These tests run fully offline with no network, no torch, and no trained model: +the lexical fallback is exercised directly, and the injected-estimator path is +driven by a plain Python stub. Compatible with pytest (``pytest test_gate.py``) +and also runnable standalone (``python test_gate.py``) since pytest is not a +hard dependency of this repo. + +Coverage: + - single tier for an easy / high-confidence query + - fusion tier for a hard / low-confidence query + - high_stakes override forces fusion regardless of difficulty + - the threshold is config-driven (passing a different threshold flips the tier) + - the cheapest model is selected on the single path + - the injected estimator overrides the lexical heuristic when an embedding is present + - confidence rises with distance from the threshold +""" + +from __future__ import annotations + +import importlib.util +import os +import sys + +# Load gate.py by file path rather than importing the package. The package +# __init__ pulls in router.py, which depends on torch; the gate has no such +# dependency, so loading the module directly keeps these tests torch-free and +# fully offline. The module is registered in sys.modules before execution so +# dataclass field-annotation resolution (PEP 563) can find its namespace. +_GATE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "gate.py")) +_spec = importlib.util.spec_from_file_location("fusion_gate_gate", _GATE_PATH) +_gate_mod = importlib.util.module_from_spec(_spec) +sys.modules[_spec.name] = _gate_mod +_spec.loader.exec_module(_gate_mod) + +GateDecision = _gate_mod.GateDecision +RouteGate = _gate_mod.RouteGate + +# Minimal candidate set mirroring default_llm.json shape (name -> prices). 
+LLM_DATA = { + "cheap-7b": {"input_price": 0.20, "output_price": 0.20}, + "mid-49b": {"input_price": 0.90, "output_price": 0.90}, + "big-141b": {"input_price": 1.20, "output_price": 1.20}, +} + + +def _gate(threshold: float = 0.5, estimator=None) -> RouteGate: + return RouteGate(llm_data=LLM_DATA, threshold=threshold, estimator=estimator) + + +# --------------------------------------------------------------------- tiers + + +def test_easy_query_routes_single_high_confidence(): + """A short, simple question stays on the cheap single path with high confidence.""" + gate = _gate(threshold=0.5) + decision = gate.decide({"query": "What is the capital of France?"}) + + assert isinstance(decision, GateDecision) + assert decision.tier == "single" + assert decision.model_name == "cheap-7b" # cheapest capable model + assert decision.panel == [] + assert decision.difficulty < 0.5 + assert decision.confidence > 0.5 # clearly below threshold => confident + + +def test_hard_query_routes_fusion_low_confidence_near_threshold(): + """A long, multi-part code/math query escalates to fusion.""" + gate = _gate(threshold=0.5) + query = ( + "Write a function to compute the integral of a matrix, then prove its " + "complexity, and also debug this regex; how do these interact with the " + "algorithm above and what is the derivative? 
" * 2 + ) + decision = gate.decide({"query": query}) + + assert decision.tier == "fusion" + assert decision.model_name is None # single-only model_name; None for fusion + assert decision.difficulty >= 0.5 + + +def test_high_stakes_forces_fusion_even_for_easy_query(): + """high_stakes overrides difficulty and forces fusion at full confidence.""" + gate = _gate(threshold=0.5) + decision = gate.decide({"query": "Hi", "high_stakes": True}) + + assert decision.tier == "fusion" + assert decision.model_name is None + assert decision.confidence == 1.0 # caller override => fully confident in fusing + + +def test_threshold_is_config_driven(): + """The same query flips tier purely based on the injected threshold.""" + query = {"query": "Explain how a hash map handles collisions and resizing."} + + # Difficulty for this query is some fixed value d in (0, 1). Compute it once. + d = _gate()._lexical_difficulty(query["query"]) + assert 0.0 < d < 1.0 # guard: must straddle for the test to be meaningful + + lenient = _gate(threshold=d + 0.1).decide(query) # threshold above d => single + strict = _gate(threshold=d - 0.1).decide(query) # threshold below d => fusion + + assert lenient.tier == "single" + assert strict.tier == "fusion" + + +# ------------------------------------------------------------ model selection + + +def test_single_path_selects_cheapest_model(): + """Single path always returns the lowest-cost candidate.""" + gate = _gate(threshold=0.99) # force single for almost anything + decision = gate.decide({"query": "easy"}) + assert decision.tier == "single" + assert decision.model_name == "cheap-7b" + + +# ------------------------------------------------------------- estimator path + + +def test_injected_estimator_overrides_lexical_heuristic(): + """When an embedding + estimator are provided, the estimator decides difficulty.""" + # Estimator ignores the embedding and returns a fixed hard score. 
+ hard_estimator = lambda _embedding: 0.95 # noqa: E731 + gate = _gate(threshold=0.5, estimator=hard_estimator) + + # Query text alone would be "easy"; the estimator should override it. + decision = gate.decide({"query": "easy", "embedding": [0.0, 0.0, 0.0]}) + assert decision.tier == "fusion" + assert abs(decision.difficulty - 0.95) < 1e-9 + + +def test_estimator_output_with_item_is_coerced(): + """A tensor-like return (exposing .item()) is coerced to a float scalar.""" + + class _ScalarLike: + def __init__(self, value: float): + self._value = value + + def item(self) -> float: + return self._value + + gate = _gate(threshold=0.5, estimator=lambda _e: _ScalarLike(0.10)) + decision = gate.decide({"query": "anything", "embedding": [1.0]}) + assert decision.tier == "single" + assert abs(decision.difficulty - 0.10) < 1e-9 + + +def test_estimator_ignored_without_embedding(): + """No embedding => lexical fallback runs even if an estimator is wired in.""" + gate = _gate(threshold=0.5, estimator=lambda _e: 0.99) + decision = gate.decide({"query": "What is 2 plus 2?"}) # no embedding key + # Falls back to lexical heuristic (easy) rather than the estimator's 0.99. + assert decision.tier == "single" + assert decision.difficulty < 0.5 + + +# ----------------------------------------------------------------- confidence + + +def test_confidence_increases_with_distance_from_threshold(): + """Confidence is monotonic in the absolute margin to the threshold.""" + gate = _gate(threshold=0.5) + near = gate._confidence(0.5) # at the boundary + mid = gate._confidence(0.7) + far = gate._confidence(1.0) + assert near < mid < far + assert near == 0.0 + assert far == 1.0 + + +def test_lexical_difficulty_is_deterministic_and_bounded(): + """The heuristic is pure and always returns a value in [0, 1].""" + gate = _gate() + samples = ["", "hi", "Explain the algorithm.", "a" * 5000, "1. x? 2. y? 
and z?"] + for text in samples: + d = gate._lexical_difficulty(text) + assert 0.0 <= d <= 1.0 + assert gate._lexical_difficulty(text) == d # deterministic + + +# ----------------------------------------------------------------- runner + +def _run_all() -> int: + tests = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)] + failures = 0 + for test in tests: + try: + test() + print(f"PASS {test.__name__}") + except AssertionError as exc: # pragma: no cover - reporting path + failures += 1 + print(f"FAIL {test.__name__}: {exc}") + except Exception as exc: # pragma: no cover - reporting path + failures += 1 + print(f"ERROR {test.__name__}: {type(exc).__name__}: {exc}") + print(f"\n{len(tests) - failures}/{len(tests)} passed") + return 1 if failures else 0 + + +if __name__ == "__main__": + raise SystemExit(_run_all()) diff --git a/custom_routers/fusion_gate/tests/test_router.py b/custom_routers/fusion_gate/tests/test_router.py new file mode 100644 index 0000000..6de9113 --- /dev/null +++ b/custom_routers/fusion_gate/tests/test_router.py @@ -0,0 +1,380 @@ +"""Offline integration tests for ``FusionGateRouter`` (UMB-121/123/124). + +These exercise the router end-to-end through a temp YAML config and a tiny +in-memory candidate file. Fully offline: + + * no large data files — the temp config references only ``llm_data`` (no + routing_data), so MetaRouter's DataLoader loads nothing heavy; + * no network — the only fusion-execution test monkeypatches ``requests.post``; + * ``--route-only`` / routing paths make NO HTTP call and spend nothing. 
+ +Coverage: + - all six config keys are read and respected (threshold, k, judge, + provider/base_url, panel_preset, cost_ceiling) + - cost_ceiling downgrades fusion -> single (abort) with no spend + - route_single is spend-free (the route-only contract): a decision dict, no + executor.run invocation + - panel varies by query type + - all three tiers (single / budget_fusion / fusion) are reachable + - fuse() logs every call via fusion_log, scrubbed and raw-payload-free +""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from typing import Any + +import pytest + +# Make the repo importable as a package root so ``custom_routers`` resolves. +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +# router.py imports torch eagerly (MetaRouter subclasses nn.Module). When torch +# is absent this module must SKIP cleanly rather than fail collection — otherwise +# it interrupts the whole suite and the four torch-free modules never run. See +# tests/conftest.py and fusion_gate/__init__.py for the torch-free design. +pytest.importorskip("torch") + +from custom_routers.fusion_gate.router import FusionGateRouter # noqa: E402 + +# Tiny candidate set (default_llm.json shape). Distinct prices/sizes so panel +# ordering and cost projection are deterministic. 
LLM_DATA: dict[str, dict[str, Any]] = {
    "cheap-7b": {
        "size": "7B",
        "feature": "fast and efficient small model",
        "input_price": 0.20,
        "output_price": 0.20,
        "model": "vendor/cheap-7b",
        "service": "OpenRouter",
    },
    "mid-49b": {
        "size": "49B",
        "feature": "powerful high-accuracy model for complex tasks",
        "input_price": 0.90,
        "output_price": 0.90,
        "model": "vendor/mid-49b",
        "service": "OpenRouter",
    },
    "big-141b": {
        "size": "141B",
        "feature": "advanced large-scale model with exceptional performance",
        "input_price": 1.20,
        "output_price": 1.20,
        "model": "vendor/big-141b",
        "service": "OpenRouter",
    },
}


def _yaml_single_quoted(value: object) -> str:
    """Render *value* as a YAML single-quoted scalar.

    Inside a YAML single-quoted scalar the only escape is a doubled quote, so
    an embedded ``'`` (e.g. a temp directory under a user name like
    ``o'brien``) must be written as ``''`` or the document is malformed.
    Values without a quote are rendered exactly as ``'value'``.
    """
    return "'" + str(value).replace("'", "''") + "'"


def _write_config(
    tmp_path: Path,
    *,
    threshold: float = 0.5,
    budget_threshold: float | None = 0.3,
    k: int = 2,
    judge: str | None = None,
    panel_preset: str = "Quality",
    cost_ceiling: float | None = None,
    log_sink_path: str | None = None,
) -> str:
    """Write a tiny llm_data JSON + a router YAML; return the YAML path.

    Args:
        tmp_path: Directory that receives ``llm.json`` and ``config.yaml``.
        threshold: Value written to ``hparam.threshold``.
        budget_threshold: Value written to ``hparam.budget_threshold``;
            ``None`` is written as YAML ``null``.
        k: Value written to ``hparam.k``.
        judge: Value written (unquoted) to ``hparam.judge``; ``None`` is
            written as YAML ``null``.
        panel_preset: Value written to ``hparam.panel_preset``.
        cost_ceiling: Value written to ``hparam.cost_ceiling``; ``None`` is
            written as YAML ``null``.
        log_sink_path: When given, written to ``hparam.log_sink_path``;
            otherwise the key is omitted entirely.

    Returns:
        The path of the written ``config.yaml`` as a string.
    """

    def _or_null(value: object) -> object:
        # YAML spells a missing value as the bare word ``null``.
        return "null" if value is None else value

    llm_path = tmp_path / "llm.json"
    llm_path.write_text(json.dumps(LLM_DATA), encoding="utf-8")

    hparam_lines = [
        f" threshold: {threshold}",
        f" k: {k}",
        f" judge: {_or_null(judge)}",
        f" panel_preset: {_yaml_single_quoted(panel_preset)}",
        f" cost_ceiling: {_or_null(cost_ceiling)}",
        " provider: 'OpenRouter'",
        " base_url: 'https://openrouter.ai/api/v1'",
        f" budget_threshold: {_or_null(budget_threshold)}",
    ]
    if log_sink_path is not None:
        # Paths are quoted through the helper so an apostrophe cannot
        # terminate the scalar early.
        hparam_lines.append(f" log_sink_path: {_yaml_single_quoted(log_sink_path)}")

    yaml_text = (
        "data_path:\n"
        f" llm_data: {_yaml_single_quoted(llm_path)}\n"
        "hparam:\n" + "\n".join(hparam_lines) + "\n"
        "api_endpoint: 'https://openrouter.ai/api/v1'\n"
    )
    cfg_path = tmp_path / "config.yaml"
    cfg_path.write_text(yaml_text, encoding="utf-8")
    return str(cfg_path)


def _router(tmp_path: Path, **kwargs) -> FusionGateRouter:
    """Build a ``FusionGateRouter`` from a config written by ``_write_config``.

    All keyword arguments are forwarded unchanged to ``_write_config``.
    """
    return FusionGateRouter(_write_config(tmp_path, **kwargs))
# ----------------------------------------------------- config keys (UMB-121)


def test_all_six_config_keys_are_read(tmp_path):
    """Configured values show up on the router, its gate, and its executor."""
    r = _router(
        tmp_path,
        threshold=0.42,
        k=3,
        judge="big-141b",
        panel_preset="Budget",
        cost_ceiling=7.5,
    )
    assert r.threshold == 0.42
    assert r.k == 3
    assert r.judge == "big-141b"
    assert r.panel_preset == "Budget"
    assert r.cost_ceiling == 7.5
    # provider/base_url pair (the 6th key).
    assert r.provider == "OpenRouter"
    assert r.base_url == "https://openrouter.ai/api/v1"
    # base_url is threaded into the executor's endpoint.
    assert r.executor.api_endpoint == "https://openrouter.ai/api/v1"
    # threshold/k/judge are threaded into the gate/executor.
    assert r.gate.threshold == 0.42
    assert r.executor.judge == "big-141b"
    assert r.executor.cost_ceiling == 7.5


# --------------------------------------------------- spend-free route-only


def test_route_single_is_spend_free(tmp_path, monkeypatch):
    """route_single never invokes the executor (no API call / no spend)."""
    r = _router(tmp_path, threshold=0.0)  # force fusion for any query

    def boom(*args, **kwargs):
        raise AssertionError("routing must not call the executor / network")

    monkeypatch.setattr(r.executor, "run", boom)

    decision = r.route_single({"query": "Solve this hard logic puzzle step by step"})
    assert decision["strategy"] == "fusion"
    assert "panel" in decision and decision["panel"]
    assert "judge" in decision
    # Carries a model_name label for CLI route_query compatibility, but no call.
    assert decision["model_name"] is not None


def test_route_only_decision_shape_for_single(tmp_path):
    """A single-tier decision names one candidate model as the prediction."""
    r = _router(tmp_path, threshold=0.99, budget_threshold=0.98)  # force single
    decision = r.route_single({"query": "Hi"})
    assert decision["strategy"] == "single"
    assert decision["tier"] == "single"
    assert decision["model_name"] in LLM_DATA
    assert decision["predicted_llm"] == decision["model_name"]


# -------------------------------------------------- cost_ceiling (UMB-121)


def test_cost_ceiling_downgrades_fusion_to_single(tmp_path):
    """A realistic per-query DOLLAR ceiling aborts fusion -> single, no spend.

    The projected cost is now an estimated dollar amount (~$0.001/query for this
    k=2 panel), so a tight $0.0005 ceiling trips the guard. This guards against
    the regression where the dollar projection silently no-op'd a sub-$1 ceiling.
    """
    r = _router(tmp_path, threshold=0.0, k=2, cost_ceiling=0.0005)
    decision = r.route_single({"query": "Solve this hard logic puzzle step by step"})

    assert decision["strategy"] == "single"
    assert decision["tier"] == "single"
    assert decision["downgraded_from"] in ("budget_fusion", "fusion")
    assert decision["projected_cost"] > 0.0005
    # Dollar-scale projection, not the old unit-price-sum proxy.
    assert decision["projected_cost"] < 0.01
    assert decision["model_name"] in LLM_DATA  # cheapest single fallback


def test_cost_ceiling_allows_fusion_when_under_cap(tmp_path):
    """Fusion is kept when the projected cost stays at or below the ceiling."""
    # A realistic $0.05/query ceiling comfortably clears this cheap k=2 panel.
    r = _router(tmp_path, threshold=0.0, k=2, cost_ceiling=0.05)
    decision = r.route_single({"query": "Solve this hard logic puzzle step by step"})
    assert decision["strategy"] == "fusion"
    assert decision["projected_cost"] <= 0.05


# -------------------------------------------------- panel varies by query


def test_panel_varies_by_query_type(tmp_path):
    """The capability-scored panel changes with the query category (UMB-123).

    Inject per-category routing performance so cheap-7b is best at code and
    big-141b is best at reasoning; a code query and a reasoning query must then
    produce different panels.
    """
    r = _router(tmp_path, threshold=0.0, k=2)
    # Swap in a capability scorer backed by category-discriminating routing data.
    from custom_routers.fusion_gate.capability import CapabilityScorer

    routing_rows = [
        {"task_name": "humaneval-code", "model_name": "cheap-7b", "performance": 0.95},
        {"task_name": "humaneval-code", "model_name": "mid-49b", "performance": 0.30},
        {"task_name": "humaneval-code", "model_name": "big-141b", "performance": 0.20},
        {"task_name": "agentverse-logicgrid", "model_name": "cheap-7b", "performance": 0.10},
        {"task_name": "agentverse-logicgrid", "model_name": "mid-49b", "performance": 0.50},
        {"task_name": "agentverse-logicgrid", "model_name": "big-141b", "performance": 0.98},
    ]
    r.capability = CapabilityScorer(llm_data=LLM_DATA, routing_data=routing_rows)

    code_panel = r.route_single({"query": "Write a function to fix this bug in my code"})["panel"]
    reasoning_panel = r.route_single(
        {"query": "Solve this logic puzzle, reason step by step"}
    )["panel"]

    assert code_panel[0] == "cheap-7b"
    assert reasoning_panel[0] == "big-141b"
    assert code_panel != reasoning_panel


def test_preset_fallback_when_capability_unavailable(tmp_path):
    """When capability scoring yields no panel, the configured preset drives it."""
    quality = _router(tmp_path, threshold=0.0, k=2, panel_preset="Quality")
    budget = _router(tmp_path, threshold=0.0, k=2, panel_preset="Budget")
    # Simulate "capability data unavailable" by making select_panel return None;
    # the router must then fall back to the configured preset (resolved by price).
    quality.capability.select_panel = lambda query, k: None
    budget.capability.select_panel = lambda query, k: None

    q_panel = quality.route_single({"query": "general question"})["panel"]
    b_panel = budget.route_single({"query": "general question"})["panel"]

    assert q_panel[0] == "big-141b"  # Quality preset -> most capable first
    assert b_panel[0] == "cheap-7b"  # Budget preset -> cheapest first
    assert q_panel != b_panel


# --------------------------------------------------- three tiers (UMB-124)


def test_all_three_tiers_reachable(tmp_path):
    """single / budget_fusion / fusion are all reachable via the two thresholds."""
    r = _router(tmp_path, threshold=0.6, budget_threshold=0.2, k=2, cost_ceiling=None)

    # Easy query -> low difficulty -> single.
    easy = r.route_single({"query": "Hi"})
    assert easy["tier"] == "single"

    # Force exact difficulties through the gate to land in each band.
    r.gate.estimator = lambda _e: 0.4  # between budget_threshold and threshold
    mid = r.route_single({"query": "x", "embedding": [0.0]})
    assert mid["tier"] == "budget_fusion"

    r.gate.estimator = lambda _e: 0.9  # above threshold
    hard = r.route_single({"query": "x", "embedding": [0.0]})
    assert hard["tier"] == "fusion"


def test_budget_tier_uses_budget_preset_on_fallback(tmp_path):
    """Mid-difficulty (budget_fusion) falls back to the cheap Budget panel.

    Even with panel_preset='Quality', the budget tier's fallback preset is Budget
    so the cheap panel is used when capability data is unavailable (UMB-124).
    """
    r = _router(tmp_path, threshold=0.6, budget_threshold=0.2, k=2, panel_preset="Quality")
    r.capability.select_panel = lambda query, k: None  # force preset fallback
    r.gate.estimator = lambda _e: 0.4  # land in the budget_fusion band

    decision = r.route_single({"query": "x", "embedding": [0.0]})
    assert decision["tier"] == "budget_fusion"
    assert decision["panel"][0] == "cheap-7b"  # Budget preset -> cheapest first


# ----------------------------------------------- fuse() logs (UMB-125 wiring)


def test_fuse_logs_every_call_scrubbed(tmp_path, monkeypatch):
    """fuse() appends one log record with secrets scrubbed and no raw payload."""
    sink = tmp_path / "fusion_log.jsonl"
    r = _router(tmp_path, threshold=0.0, k=2, log_sink_path=str(sink))

    payload = {
        "status": "ok",
        "answer": "Fused answer.",
        "analysis": {"consensus": "agree", "contradictions": [], "blind_spots": []},
        "responses": [
            {"model": "big-141b", "content": "leaked sk-abcdefghijklmnop secret"},
            {"model": "mid-49b", "content": "ok"},
        ],
        "cost": 2.0,
    }

    # Patch the executor's HTTP seam directly so the test is robust whether or
    # not `requests` is installed (the executor falls back to urllib otherwise).
    # No real network call is ever made.
    monkeypatch.setattr(
        r.executor, "_post_chat_completions", lambda body, api_key: payload
    )

    decision = r.route_single({"query": "Solve this hard logic puzzle step by step"})
    result = r.fuse(decision, api_keys={"OpenRouter": "sk-test-key"})

    assert result.answer == "Fused answer."
    assert sink.exists()

    lines = sink.read_text(encoding="utf-8").strip().splitlines()
    assert len(lines) == 1
    record = json.loads(lines[0])
    assert record["strategy"] == "fusion"
    assert record["query"].startswith("Solve this hard logic puzzle")
    # Raw provider payload must NOT be persisted, and inline secrets scrubbed.
    assert "raw" not in record
    serialized = json.dumps(record)
    assert "sk-abcdefghijklmnop" not in serialized
    assert "sk-test-key" not in serialized


def test_fuse_returns_result_when_log_sink_unwritable(tmp_path, monkeypatch):
    """A logging failure must NOT destroy the already-computed FusionResult.

    The sink directory is made unwritable so ``log_fusion`` raises an OSError on
    write; ``fuse()`` must swallow it and still return the result.

    Skipped when the platform does not enforce the directory permission bits
    (running as root, or on Windows): the write would then succeed and the
    failure path under test would never be exercised.
    """
    locked_dir = tmp_path / "locked"
    locked_dir.mkdir()
    sink = locked_dir / "fusion_log.jsonl"
    r = _router(tmp_path, threshold=0.0, k=2, log_sink_path=str(sink))

    payload = {
        "status": "ok",
        "answer": "Fused answer.",
        "analysis": {"consensus": "agree", "contradictions": [], "blind_spots": []},
        "responses": [{"model": "big-141b", "content": "ok"}],
        "cost": 2.0,
    }
    monkeypatch.setattr(
        r.executor, "_post_chat_completions", lambda body, api_key: payload
    )

    # Revoke write/execute so creating the file under it fails with OSError.
    os.chmod(locked_dir, 0o500)
    try:
        # Probe inside the try so the permissions are restored even on skip.
        if os.access(locked_dir, os.W_OK):
            pytest.skip("directory permissions are not enforced (root or Windows)")
        decision = r.route_single({"query": "Solve this hard logic puzzle step by step"})
        result = r.fuse(decision, api_keys={"OpenRouter": "sk-test-key"})
    finally:
        # Restore perms so tmp_path cleanup can remove the tree.
        os.chmod(locked_dir, 0o700)

    # The result survives even though the log write failed.
    assert result.answer == "Fused answer."
    assert not sink.exists()


def test_fuse_rejects_non_fusion_result(tmp_path):
    """fuse() raises ValueError when handed a single-strategy decision."""
    r = _router(tmp_path, threshold=0.99, budget_threshold=0.98)
    decision = r.route_single({"query": "Hi"})
    assert decision["strategy"] == "single"
    with pytest.raises(ValueError):
        r.fuse(decision)


if __name__ == "__main__":
    raise SystemExit(pytest.main([__file__, "-v"]))