diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index c7b96f5..e88e0b8 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "buidl", - "version": "6.0.0", - "description": "Full dev lifecycle for OP_NET Bitcoin L1 projects: idea → challenge → spec → build → review → ship. Self-learning across sessions with pattern extraction, agent performance scoring, score-based finding routing, project-type profiles, cross-layer validation, and starter templates. Includes shell-enforced E2E testing gates, frontend runtime smoke checks, PUA problem-solving methodology, the OP_NET Bible (2000+ lines), cross-agent critique, adversarial auditing, adversarial E2E testing, ABI-lock checkpoints, findings ledger with regression tracking, acceptance test generation, chain probe, hard gate enforcement, incremental audits, dry-run mode, execution tracing, dynamic re-planning from learned patterns, dynamic knowledge slice loading, property-based fuzz testing, and stale pattern pruning. Agents get smarter with every project.", + "version": "7.0.0", + "description": "Full dev lifecycle for OP_NET Bitcoin L1 projects: idea → challenge → spec → build → review → ship. Self-learning across sessions with pattern extraction, agent performance scoring, score-based finding routing, project-type profiles, cross-layer validation, and starter templates. 
Includes shell-enforced E2E testing gates, frontend runtime smoke checks, PUA problem-solving methodology, the OP_NET Bible (2000+ lines), cross-agent critique, adversarial auditing, adversarial E2E testing, ABI-lock checkpoints, findings ledger with regression tracking, acceptance test generation, chain probe, hard gate enforcement, incremental audits, dry-run mode, execution tracing, dynamic re-planning from learned patterns, dynamic knowledge slice loading, property-based fuzz testing, stale pattern pruning, mutation testing as loop exit gate, structured repair phases (R1/R2/R3), goal-oriented build evaluation, hierarchical repo map, and autoresearch optimize mode. Agents get smarter with every project.", "author": { "name": "dannyplainview + bob" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f35a4a..be2570f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,30 @@ # Changelog +## [7.0.0] - 2026-03-13 + +### Added +- **Mutation testing as loop exit gate** (`scripts/mutate-contract.sh`): Applies 20 sed-level mutation operators to contract source files. For each mutant: creates a temp copy, applies the mutation, compiles, runs tests. If tests fail, the mutant is killed (good). If tests pass or compilation fails, the mutant survived (bad). Outputs `artifacts/testing/mutation-score.json` with total_mutants, killed, survived, compile_errors, mutation_score (0-1), threshold (0.70), verdict (PASS/FAIL), and survivors list. Phase 5 runs this gate before the reviewer -- score below 0.70 routes back to contract-dev with the survivors list. +- **Structured repair phases** (Agentless R1/R2/R3 pattern): Replaces "re-run agent with failure context" with three targeted phases. Phase R1 (LOCALIZE): max_turns 5, READ-ONLY, reviewer in localize mode produces localization.json. Phase R2 (PATCH): max_turns 10, domain agent receives localized context only, generates up to 3 candidate patches. 
Phase R3 (VALIDATE): automated, runs tests and mutation on each candidate, picks the best. +- **Failure localization script** (`scripts/localize-failure.sh`): Parses failure logs to extract file, function, line_range, suspected_cause, confidence, and failure_category. Outputs `artifacts/localization.json`. +- **Localize Mode** (`agents/loop-reviewer.md`): New reviewer mode for Phase R1 -- strict 5-turn READ-ONLY process. Produces localization.json only. Code generation is FORBIDDEN. +- **Goal-oriented build evaluation** (`scripts/score-build.sh`): Evaluates builds across 4 dimensions: spec_coverage (requirements with tests / total, threshold 90%), security_delta (open findings count, threshold 0), mutation_score (from mutation-score.json, threshold 70%), code_health (100 minus weighted penalties, threshold 60%). Outputs `artifacts/evaluation/progress-tracker.yaml`. All thresholds must be met. Failed dimensions route to responsible agents. +- **Requirements extraction** (`scripts/extract-requirements.sh`): Parses requirements.md and extracts individual requirements into `artifacts/evaluation/spec-requirements.yaml` with id, description, has_test, and priority fields. +- **Hierarchical cross-layer repo map** (`scripts/build-repo-map.sh`): Generates `artifacts/repo-map.md` with Contract Layer (from abi.json: methods, events, storage slots), Frontend Layer (components, hooks, services, contract calls), Backend Layer (routes, services, contract calls), and Cross-Layer Integrity Checks (missing methods, uncalled methods). Target under 300 lines. +- **Autoresearch optimize mode** (`commands/buidl-optimize.md`): New `/buidl-optimize` command for automated metric optimization. Supports gas, bundle_size, test_time, and throughput metrics. Runs a hypothesis-implement-benchmark-keep/revert cycle up to 10 times. Outputs summary.md, best-result.json, and auto-creates a PR with kept changes. 
+ +### Changed +- **Orchestrator Phase 5** (`commands/buidl.md`): Mutation gate runs before reviewer dispatch. If mutation score < 0.70, routes back to contract-dev with survivors. Score-build runs after each review cycle, displaying a compact 4-dimension score table. All thresholds must be met for build completion. +- **Orchestrator agent failure handling** (`commands/buidl.md`): Agent failures now go through R1/R2/R3 structured repair before falling back to manual options. Localization produces targeted context, domain agents generate candidate patches, and validation picks the best one automatically. +- **Orchestrator Phase 4** (`commands/buidl.md`): Repo map generated after ABI lock (contract layer only), regenerated after all builders complete (all layers populated). +- **Orchestrator FAIL routing** (`commands/buidl.md`): Uses R1/R2/R3 structured repair for targeted fixes instead of raw agent re-dispatch with full failure context. +- **All 12 domain agent files**: Updated Step 0 / knowledge loading to reference `artifacts/repo-map.md` for cross-layer context. Agents: cross-layer-validator, loop-builder, loop-explorer, loop-researcher, loop-reviewer, opnet-auditor, opnet-backend-dev, opnet-contract-dev, opnet-deployer, opnet-e2e-tester, opnet-frontend-dev, opnet-ui-tester. +- **buidl-status** (`commands/buidl-status.md`): Shows mutation score ("Mutation: 83% (15/18 killed)") and 4-dimension build score card when available. Steps renumbered from 7-10 to 9-12. +- **loop-reviewer** (`agents/loop-reviewer.md`): Added Localize Mode section after Critique Mode. +- **Plugin version**: 6.0.0 -> 7.0.0 + +### Why +Four gaps identified in the build verification and repair systems. (1) The loop had no way to measure test quality -- tests could pass while missing entire categories of bugs. Mutation testing quantifies test effectiveness by checking whether tests detect deliberate code changes. 
(2) When agents failed, the entire failure context was re-injected, leading to unfocused repair attempts. Structured R1/R2/R3 phases localize the failure first, then generate targeted patches, then validate them automatically. (3) The reviewer produced a single PASS/FAIL verdict with no multi-dimensional visibility. Goal-oriented evaluation scores across 4 dimensions (spec coverage, security, mutation, code health) with clear thresholds and routing for each. (4) Agents had no shared map of how contract methods connected to frontend calls and backend routes. The hierarchical repo map provides cross-layer visibility, and integrity checks automatically detect missing or extra method calls. + ## [6.0.0] - 2026-03-13 ### Added diff --git a/README.md b/README.md index d749376..866e2e2 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ alias claudeyproj="claude --dangerously-skip-permissions --plugin-dir /path/to/b | `/buidl-clean` | Cancel + remove worktree and branch | | `/buidl-trace` | Show agent execution trace timeline for the current session | | `/buidl-learning` | Show learning system health report (patterns, scores, profiles) | +| `/buidl-optimize ` | Optimize gas, bundle_size, test_time, or throughput via automated experimentation | ### Flags @@ -143,6 +144,21 @@ Agents no longer load the full 2000-line bible regardless of role. `scripts/load #### Property-Based Fuzz Testing `scripts/fuzz-contract.sh` reads a contract ABI, extracts method signatures and parameter types, and generates boundary test cases: u256 values [0, 1, 2^128, 2^256-1, 2^256-2], address values [zero, contract, caller], bool values [true, false]. Produces all single-param boundary combinations plus 10 random combinations per method. Output goes to `artifacts/testing/fuzz-cases.json` and feeds into both the adversarial auditor and adversarial E2E tester. Does not send transactions. 
+#### Mutation Testing Gate +`scripts/mutate-contract.sh` applies 20 mutation operators to contract source and checks whether tests catch each mutation. Mutations include arithmetic swaps (add/sub, mul/div), comparison inversions, boolean flips, revert removal, constant swaps, and event removal. The mutation score (killed/total) must be >= 70% to proceed to review. Surviving mutants are routed back to contract-dev with specific details of what the tests missed. + +#### Structured Repair Phases (R1/R2/R3) +When agents fail, repair follows three targeted phases instead of blindly re-running with full context. R1 (LOCALIZE): the reviewer in localize mode identifies the exact file, function, and line range. R2 (PATCH): the domain agent receives only the localized context and generates up to 3 candidate patches. R3 (VALIDATE): patches are tested and scored automatically, and the best one is applied. + +#### Goal-Oriented Build Evaluation +`scripts/score-build.sh` evaluates builds across 4 dimensions: spec coverage (requirements with tests), security delta (open findings), mutation score, and code health. Each dimension has a threshold and routes to the responsible agent on failure. The compact score table is displayed after every review cycle. + +#### Hierarchical Repo Map +`scripts/build-repo-map.sh` generates a cross-layer map from the ABI, frontend source, and backend source. Shows contract methods with signatures, frontend components and their contract calls, backend routes and their contract calls, and auto-detects missing methods (called but not in ABI) and uncalled methods (in ABI but never referenced). + +#### Autoresearch Optimize Mode +`/buidl-optimize gas` runs an automated optimization loop: hypothesize, implement, benchmark, keep/revert. Supports gas, bundle_size, test_time, and throughput metrics. Default 10 cycles. No test regressions allowed. Produces a summary and auto-creates a PR with kept changes. 
+ #### Dynamic Re-Planning When an agent fails after retry, the orchestrator queries `learning/patterns.yaml` for known fix patterns matching the failure category. If a match is found, it presents a 5th option ("Apply known fix: [description]") alongside the standard 4 error-handling options. Lessons from past sessions are applied automatically instead of requiring manual intervention. @@ -301,6 +317,7 @@ The auditor and reviewer check for 27 confirmed vulnerability patterns extracted | checkpoint after each agent + cost log v Phase 5: REVIEW + Mutation gate (>= 70% required) + 4-dim score card Reviewer checks PR against spec + 27 patterns | checkpoint v @@ -317,9 +334,9 @@ The auditor and reviewer check for 27 confirmed vulnerability patterns extracted ``` buidl/ +-- .claude-plugin/ -| +-- plugin.json # Plugin manifest (v6.0.0) +| +-- plugin.json # Plugin manifest (v7.0.0) +-- agents/ # 14 agent definitions (incl. adversarial auditor + tester) -+-- commands/ # 10 slash commands (incl. buidl-trace, buidl-learning) ++-- commands/ # 11 slash commands (incl. 
buidl-optimize) +-- hooks/ # Stop hook + state guards | +-- scripts/ +-- knowledge/ # OPNet reference + domain slices @@ -328,14 +345,14 @@ buidl/ | +-- patterns.yaml # Structured pattern store (auto-updated) | +-- agent-scores.yaml # Agent performance metrics (auto-updated) | +-- profiles/ # Auto-generated project-type profiles -+-- scripts/ # Setup + state writer + learning + routing + tracing + fuzz + knowledge scripts ++-- scripts/ # Setup + state + learning + routing + tracing + fuzz + mutation + scoring + repo-map scripts +-- skills/ # 3 triggerable skills | +-- audit-from-bugs/ | +-- loop-guide/ | +-- pua/ +-- templates/ # Domain agent, knowledge slice, starter templates | +-- starters/ # Project scaffolds (op20-token, more planned) -+-- tests/ # 419 structural + functional + integration tests ++-- tests/ # 450+ structural + functional + integration tests ``` ## Testing @@ -344,7 +361,7 @@ buidl/ bash tests/plugin-tests.sh ``` -434+ tests across 53 categories covering shell syntax, agent structure, FORBIDDEN blocks, knowledge references, issue bus schema, version consistency, state guards, resume logic, learning system, templates, cost tracking, wall-clock timeout, max_turns, integration tests, transaction simulation, Playwright E2E, adaptive learning, cross-layer validation, starter templates, score-based routing, project-type profiles, cross-agent critique, incremental audit, dry-run mode, agent tracing, dynamic re-planning, acceptance test locking, ABI-lock, adversarial auditing, adversarial E2E testing, failure diagnosis, findings ledger, chain probe, hard gate enforcement, and regression tracking. 
+450+ tests across 57 categories covering shell syntax, agent structure, FORBIDDEN blocks, knowledge references, issue bus schema, version consistency, state guards, resume logic, learning system, templates, cost tracking, wall-clock timeout, max_turns, integration tests, transaction simulation, Playwright E2E, adaptive learning, cross-layer validation, starter templates, score-based routing, project-type profiles, cross-agent critique, incremental audit, dry-run mode, agent tracing, dynamic re-planning, acceptance test locking, ABI-lock, adversarial auditing, adversarial E2E testing, failure diagnosis, findings ledger, chain probe, hard gate enforcement, regression tracking, mutation testing, structured repair phases, goal-oriented evaluation, repo map, and autoresearch optimize. Tests run automatically on every push and PR via GitHub Actions. @@ -352,6 +369,9 @@ Tests run automatically on every push and PR via GitHub Actions. ## Version History +### v7.0.0 — Mutation + Repair + Scoring (2026-03-13) +Four verification and repair improvements: **Mutation testing gate** applies 20 operators to contract source, requiring >= 70% kill rate before review. **Structured repair phases** (R1/R2/R3) localize failures, generate targeted patches, and validate automatically. **Goal-oriented build evaluation** scores builds across 4 dimensions (spec coverage, security, mutation, code health) with routing for each failed dimension. **Hierarchical repo map** provides cross-layer visibility from ABI to frontend/backend calls. Plus **autoresearch optimize mode** for automated metric improvement. + ### v6.0.0 — Dynamic Knowledge (2026-03-13) Three knowledge and learning system improvements: **Dynamic knowledge slice loading** assembles role-specific knowledge payloads per agent, filtering the 2000-line bible to only role-relevant sections and keeping payloads under 400 lines. 
**Property-based fuzz case generation** creates structured boundary test cases from ABI signatures for adversarial auditing. **Stale pattern pruning** with version-based staleness tracking and a `/buidl-learning` health report. diff --git a/agents/cross-layer-validator.md b/agents/cross-layer-validator.md index 7eb6ce4..671e306 100644 --- a/agents/cross-layer-validator.md +++ b/agents/cross-layer-validator.md @@ -47,6 +47,7 @@ You are the **Cross-Layer Validator** agent. You check integration correctness a Before any validation: 1. Load your knowledge payload via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh cross-layer-validator ` — this assembles your domain slice (cross-layer-validation.md), troubleshooting guide, and learned patterns. 2. If you encounter issues, check [knowledge/opnet-troubleshooting.md](knowledge/opnet-troubleshooting.md). +3. If `artifacts/repo-map.md` exists, read it for cross-layer method mapping and integrity checks. ## Process diff --git a/agents/loop-builder.md b/agents/loop-builder.md index f12cc66..4917ecb 100644 --- a/agents/loop-builder.md +++ b/agents/loop-builder.md @@ -153,6 +153,8 @@ Check in this order: - Use `deriveOPWallet()` not `derive()` for OPWallet-compatible keys - `Buffer` is gone — use `BufferHelper` from `@btc-vision/transaction` +4. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). + --- ## Step 0.5: Load PUA Methodology (MANDATORY) diff --git a/agents/loop-explorer.md b/agents/loop-explorer.md index 4e71da9..2589eb8 100644 --- a/agents/loop-explorer.md +++ b/agents/loop-explorer.md @@ -48,6 +48,7 @@ Before starting analysis, check if this is an OPNet project: 1. Read `package.json` — look for `@btc-vision/*` or `opnet` in dependencies. 2. Check for `asconfig.json` (contract project), `vite.config.ts` (frontend), or `@btc-vision/hyper-express` (backend). 3. 
If OPNet detected: load knowledge via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh loop-explorer ` — this assembles the project-setup.md slice, troubleshooting guide, and learned patterns. This informs what patterns to look for. +4. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). ## Process diff --git a/agents/loop-researcher.md b/agents/loop-researcher.md index 1d8a4fa..4f3c2a8 100644 --- a/agents/loop-researcher.md +++ b/agents/loop-researcher.md @@ -36,6 +36,8 @@ Before searching, read the feature description you were given carefully. Identif For Bitcoin/OPNet projects: prioritize searching the OPNet ecosystem first — btc-vision GitHub repos (github.com/btc-vision/*), OPNet docs, and existing OPNet dApps. Most OPNet patterns already have reference implementations (MotoSwap for DEX, NativeSwap for BTC-token swaps, etc.). +4. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). + ## Process ### Step 1: Search for Existing Solutions diff --git a/agents/loop-reviewer.md b/agents/loop-reviewer.md index b1e5343..8068c8a 100644 --- a/agents/loop-reviewer.md +++ b/agents/loop-reviewer.md @@ -44,6 +44,7 @@ Before reviewing: 2. Read the PR diff via `gh pr diff `. 3. If this is an OPNet project (check for `@btc-vision/*` or `opnet` in package.json), load knowledge via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh loop-reviewer ` — this assembles the integration-review.md slice, troubleshooting guide, and learned patterns. 4. Read [skills/pua/SKILL.md](skills/pua/SKILL.md) for the proactivity checklist -- use it to evaluate whether builders were thorough. +5. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). 
**Review Proactivity (from PUA):** - For each finding: check if similar issues exist elsewhere in the diff. @@ -281,6 +282,42 @@ Write `artifacts/cross-critique.md` with structured findings: CRITICAL findings from cross-critique are routed back to the original builder agent by the orchestrator. +## Localize Mode + +When dispatched in **localize mode** by the orchestrator (for structured repair Phase R1), use a failure-focused READ-ONLY process: + +- **max_turns**: 5 (strict limit) +- **Mode**: READ-ONLY. You CANNOT modify any files other than `artifacts/localization.json`. You CANNOT generate code. You CANNOT suggest patches. +- **Focus**: Determine exactly WHERE and WHY a failure occurred. +- **Scope**: Read only the failure log, the referenced source files, and the test files. + +Process: +1. Read the failure log provided in the dispatch context. +2. Run `bash ${CLAUDE_PLUGIN_ROOT}/scripts/localize-failure.sh ` to get initial localization. +3. Read the file and line range identified in `artifacts/localization.json`. +4. Read related test files to understand what the test expected vs what happened. +5. Refine the localization if the initial output has low confidence. + +Write `artifacts/localization.json` with refined data: + +```json +{ + "file": "src/contracts/MyContract.ts", + "function": "transfer", + "line_range": [42, 55], + "suspected_cause": "SafeMath.sub underflows when balance is zero", + "confidence": "high", + "failure_category": "test_failure" +} +``` + +**FORBIDDEN in Localize Mode:** +- Generating code or patches +- Modifying any files other than `artifacts/localization.json` +- Suggesting fixes (that is Phase R2's job) +- Running more than 5 turns +- Expanding scope beyond the failure context + ## Rules 1. **Be specific.** "This could be better" is worthless. "The switch at auth.ts:42 doesn't handle the 'expired' case" is actionable. 
diff --git a/agents/opnet-auditor.md b/agents/opnet-auditor.md index 4855a7b..a107825 100644 --- a/agents/opnet-auditor.md +++ b/agents/opnet-auditor.md @@ -48,6 +48,7 @@ You are the **OPNet Security Auditor** agent. You perform security audits on OPN Before auditing ANY code: 1. Load your knowledge payload via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh opnet-auditor ` — this assembles your domain slice (security-audit.md), troubleshooting guide, relevant bible sections ([SECURITY]), and learned patterns. 2. Read [skills/pua/SKILL.md](skills/pua/SKILL.md) for debugging discipline. As an auditor, apply the "Verify, don't assume" and "Read completely" principles to every finding. +3. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). **Audit Discipline (from PUA + GSD-2):** - Read entire functions and their imports, not just the line that looks suspicious. diff --git a/agents/opnet-backend-dev.md b/agents/opnet-backend-dev.md index 42e557e..5202ce0 100644 --- a/agents/opnet-backend-dev.md +++ b/agents/opnet-backend-dev.md @@ -46,6 +46,7 @@ Before writing ANY code: 1. Load your knowledge payload via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh opnet-backend-dev ` — this assembles your domain slice (`knowledge/slices/backend-dev.md`), troubleshooting guide, relevant bible sections ([BACKEND]), and learned patterns. 2. Read [skills/pua/SKILL.md](skills/pua/SKILL.md) COMPLETELY. This is your problem-solving methodology. 3. If you encounter issues, check [knowledge/opnet-troubleshooting.md](knowledge/opnet-troubleshooting.md). +4. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). **The PUA methodology applies throughout your session:** exhaust all options before escalating, act before asking, take initiative, verify after every fix. 
diff --git a/agents/opnet-contract-dev.md b/agents/opnet-contract-dev.md index b2b9d65..008806b 100644 --- a/agents/opnet-contract-dev.md +++ b/agents/opnet-contract-dev.md @@ -54,6 +54,7 @@ Before writing ANY code: 1. Load your knowledge payload via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh opnet-contract-dev ` — this assembles your domain slice (`knowledge/slices/contract-dev.md`), troubleshooting guide, the full bible (all sections), and learned patterns. 2. Read [skills/pua/SKILL.md](skills/pua/SKILL.md) COMPLETELY. This is your problem-solving methodology. 3. If you encounter issues, check [knowledge/opnet-troubleshooting.md](knowledge/opnet-troubleshooting.md). +4. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). **The PUA methodology applies throughout your session:** exhaust all options before escalating, act before asking, take initiative, verify after every fix. diff --git a/agents/opnet-deployer.md b/agents/opnet-deployer.md index 0287e30..1c05428 100644 --- a/agents/opnet-deployer.md +++ b/agents/opnet-deployer.md @@ -55,6 +55,8 @@ Also read [knowledge/slices/transaction-simulation.md](knowledge/slices/transact If you encounter issues, check [knowledge/opnet-troubleshooting.md](knowledge/opnet-troubleshooting.md). +If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). + ## Process ### 1. Pre-Deploy Verification (MANDATORY) diff --git a/agents/opnet-e2e-tester.md b/agents/opnet-e2e-tester.md index b1a28a8..ce418c5 100644 --- a/agents/opnet-e2e-tester.md +++ b/agents/opnet-e2e-tester.md @@ -72,6 +72,8 @@ Also read [knowledge/slices/transaction-simulation.md](knowledge/slices/transact If you encounter issues, check [knowledge/opnet-troubleshooting.md](knowledge/opnet-troubleshooting.md) and query the opnet-bob MCP server. 
+If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). + ## Inputs You receive: diff --git a/agents/opnet-frontend-dev.md b/agents/opnet-frontend-dev.md index d4ae9b5..31e2b96 100644 --- a/agents/opnet-frontend-dev.md +++ b/agents/opnet-frontend-dev.md @@ -55,6 +55,7 @@ Before writing ANY code: 2. Also read [knowledge/slices/transaction-simulation.md](knowledge/slices/transaction-simulation.md) -- the "Frontend Simulation Pattern" section. 3. Read [skills/pua/SKILL.md](skills/pua/SKILL.md) COMPLETELY. This is your problem-solving methodology. 4. If you encounter issues, check [knowledge/opnet-troubleshooting.md](knowledge/opnet-troubleshooting.md). +5. If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). **The PUA methodology applies throughout your session:** exhaust all options before escalating, act before asking, take initiative, verify after every fix. diff --git a/agents/opnet-ui-tester.md b/agents/opnet-ui-tester.md index f2fc00f..89082f4 100644 --- a/agents/opnet-ui-tester.md +++ b/agents/opnet-ui-tester.md @@ -50,6 +50,8 @@ You are the **OPNet UI Tester** agent. You test OPNet dApp frontends using Playw Load your knowledge payload via `bash ${CLAUDE_PLUGIN_ROOT}/scripts/load-knowledge.sh opnet-ui-tester ` — this assembles your domain slice (`knowledge/slices/ui-testing.md`), troubleshooting guide, and learned patterns. +If `artifacts/repo-map.md` exists, read it for cross-layer context (contract methods, frontend components, backend routes, integrity checks). 
+ ## Process ### Step 1: Setup diff --git a/commands/buidl-optimize.md b/commands/buidl-optimize.md new file mode 100644 index 0000000..376feb9 --- /dev/null +++ b/commands/buidl-optimize.md @@ -0,0 +1,138 @@ +--- +description: "Optimize a metric (gas, bundle_size, test_time, throughput) via automated experimentation" +argument-hint: ' [--target N] [--max-cycles 10]' +allowed-tools: ["Read", "Write", "Edit", "Bash(bash:*)", "Grep", "Glob"] +--- + +# Autoresearch Optimize Mode + +You are running an optimization loop that iterates toward a measurable improvement target. Follow the cycle below. Do not skip steps. + +## FORBIDDEN + +1. **No mainnet transactions.** All experiments run on testnet or local only. +2. **No modifying locked acceptance tests.** Files in `artifacts/acceptance-tests/` are immutable. +3. **No test regressions.** Every cycle must pass the full test suite before keeping changes. +4. **No unrelated changes.** Only touch code that affects the target metric. + +## Parse Input + +Arguments: `$ARGUMENTS` + +Supported metrics: +- `gas` — Reduce contract gas consumption +- `bundle_size` — Reduce frontend bundle size (bytes) +- `test_time` — Reduce test suite execution time (seconds) +- `throughput` — Increase transactions per second + +Default `--max-cycles`: 10 +Default `--target`: metric-dependent (gas: -10%, bundle_size: -15%, test_time: -20%, throughput: +20%) + +## Step 0: Baseline + +1. Run the full test suite to confirm all tests pass. If any fail, STOP and report. +2. Measure the current value of the target metric: + - `gas`: compile contract, read gas report from build output + - `bundle_size`: run `npm run build` and measure `dist/` size + - `test_time`: time the test suite execution + - `throughput`: run benchmark suite if available +3. 
Record the baseline in `artifacts/optimize/baseline.json`: + ```json + { + "metric": "", + "baseline_value": , + "target_value": , + "unit": "", + "timestamp": "" + } + ``` + +## Optimization Cycle + +Repeat up to `max_cycles` times: + +### 1. Hypothesize + +State a specific hypothesis: "Changing X in file Y will reduce metric by approximately Z because [reason]." + +Write the hypothesis to `artifacts/optimize/cycle--hypothesis.md`. + +### 2. Implement + +Make the minimal code change to test the hypothesis. Keep changes small and reversible. + +### 3. Benchmark + +1. Run the full test suite. If any test fails, REVERT immediately and try a different hypothesis. +2. Measure the target metric with the same method as Step 0. +3. Record the result in `artifacts/optimize/cycle--result.json`: + ```json + { + "cycle": , + "hypothesis": "", + "metric_before": , + "metric_after": , + "delta": , + "delta_pct": , + "tests_pass": true, + "kept": + } + ``` + +### 4. Keep or Revert + +- If the metric improved AND all tests pass: KEEP the change. Update the running best. +- If the metric worsened OR any test failed: REVERT via `git checkout -- .` to discard all uncommitted changes. +- If the metric is unchanged: REVERT (no point keeping neutral changes). + +### 5. Check Target + +- If the cumulative improvement meets or exceeds the target: STOP, declare success. +- If `cycle >= max_cycles`: STOP, report best result achieved. +- Otherwise: continue to next cycle. + +## Output + +When done, write: + +1. `artifacts/optimize/summary.md`: + ```markdown + # Optimization Summary + + ## Target + Metric: + Baseline: + Target: + Best achieved: (% improvement) + + ## Cycles + | Cycle | Hypothesis | Before | After | Delta | Kept | + |-------|-----------|--------|-------|-------|------| + | 1 | ... | ... | ... | ... | Y/N | + + ## Conclusion + [Summary of what worked, what did not, and why] + ``` + +2. 
`artifacts/optimize/best-result.json`: + ```json + { + "metric": "", + "baseline_value": , + "best_value": , + "improvement_pct": , + "target_met": , + "cycles_used": , + "max_cycles": + } + ``` + +3. Create a PR with the kept changes: + ```bash + git checkout -b optimize/- + git add -A + git commit -m "optimize: reduce by %" + gh pr create --title "optimize: -%" --body "..." + ``` + +Print the PR URL when done. diff --git a/commands/buidl-status.md b/commands/buidl-status.md index cf4a30d..0937765 100644 --- a/commands/buidl-status.md +++ b/commands/buidl-status.md @@ -42,10 +42,20 @@ Elapsed: [computed from started_at, or "unknown"] ``` Learning: [N patterns (N stale) / N agents scored / N profiles] ``` -7. If there are review files in the session directory, show the latest verdict. -8. If a checkpoint file exists at `.claude/loop/sessions//checkpoint.md`, show the last checkpoint timestamp and next action. -9. If the status is `done`, `failed`, `cancelled`, or `timed_out`, include the summary. -10. **Orphan worktree detection**: Run `git worktree list` and cross-reference with the active state: +7. **Mutation Score**: If `artifacts/testing/mutation-score.json` exists, parse and show: + ``` + Mutation: [X]% ([killed]/[total] killed) — [verdict] + ``` + Example: `Mutation: 83% (15/18 killed) — PASS` +8. **Build Score Card**: If `artifacts/evaluation/progress-tracker.yaml` exists, parse and show the 4-dimension scores: + ``` + Build Score: spec=[X]% security=[N] mutation=[X]% health=[X]% — [verdict] + ``` + Example: `Build Score: spec=95% security=0 mutation=83% health=80% — PASS` +9. If there are review files in the session directory, show the latest verdict. +10. If a checkpoint file exists at `.claude/loop/sessions//checkpoint.md`, show the last checkpoint timestamp and next action. +11. If the status is `done`, `failed`, `cancelled`, or `timed_out`, include the summary. +12. 
**Orphan worktree detection**: Run `git worktree list` and cross-reference with the active state: - For each worktree under `.claude/worktrees/loop-*`, check if there's a matching session in state. - If a worktree exists but no state file references it (or state is `done`/`cancelled`), flag it as orphaned: ``` diff --git a/commands/buidl.md b/commands/buidl.md index 0b6e92b..9f450f5 100644 --- a/commands/buidl.md +++ b/commands/buidl.md @@ -635,23 +635,47 @@ Critique dispatch: **If an agent fails:** 1. Retry once, including the error output in the retry prompt. 2. Log a trace event: `bash ${CLAUDE_PLUGIN_ROOT}/scripts/trace-event.sh error build "Agent failed after retry"` -3. If retry also fails, check for a known fix pattern: +3. If retry also fails, run the **Structured Repair Phases** (Agentless Pattern): + + **Phase R1 -- LOCALIZE** (max_turns: 5, READ-ONLY): + Run failure localization on the agent's error output: + ```bash + bash ${CLAUDE_PLUGIN_ROOT}/scripts/localize-failure.sh <failure-log-path> + ``` + This produces `artifacts/localization.json` with: file, function, line_range, suspected_cause, confidence, failure_category. The reviewer is dispatched in **localize mode** (see loop-reviewer.md Localize Mode). Output is localization.json only -- NO code generation. + + **Phase R2 -- PATCH** (max_turns: 10): + Dispatch the domain agent (the one that failed) with ONLY the localized context: + - The localization.json file + - The specific file and line range identified + - Instruction: "Generate up to 3 candidate patches for the issue at {file}:{line_range}. Each patch should be a minimal fix addressing: {suspected_cause}." + The agent produces up to 3 candidate patches in `artifacts/repair/patch-1.diff`, `patch-2.diff`, `patch-3.diff`. + + **Phase R3 -- VALIDATE** (automated): + For each candidate patch: + 1. Apply the patch to a temp copy + 2. Run the full test suite + 3. If contract: run mutation testing + 4. 
Score the result: tests passing + mutation score + Pick the best-scoring patch and apply it. If no patch passes tests, fall through to the manual options below. + +4. If R1/R2/R3 produced a working fix, continue the loop. Otherwise, check for a known fix pattern: ```bash PATTERN_MATCH=$(bash ${CLAUDE_PLUGIN_ROOT}/scripts/query-pattern.sh "" "" 2>/dev/null || true) ``` -4. If a pattern match is found (`PATTERN_MATCH` is non-empty), present AskUserQuestion with 5 numbered options: +5. If a pattern match is found (`PATTERN_MATCH` is non-empty), present AskUserQuestion with 5 numbered options: - "Apply known fix: [description from pattern match]" - "Retry with a different approach" - "Skip this agent and continue" - "Amend the spec to work around this" - "Cancel the loop" If the user selects "Apply known fix", apply the fix from the pattern, log a replan trace event, and retry the agent. -5. If no pattern match is found, present AskUserQuestion with 4 numbered options: +6. If no pattern match is found, present AskUserQuestion with 4 numbered options: - "Retry with a different approach" - "Skip this agent and continue" - "Amend the spec to work around this" - "Cancel the loop" -6. Never ask open-ended questions like "what should I do?" +7. Never ask open-ended questions like "what should I do?" #### Step 2a: Contract Development (if components.contract = true) @@ -673,6 +697,13 @@ After contract-dev completes successfully: This hash is verified before frontend/backend dispatch to detect unauthorized ABI modifications. +**Generate Repo Map (after ABI lock):** +After contract-dev completes and ABI is locked, generate the hierarchical repo map: +```bash +bash ${CLAUDE_PLUGIN_ROOT}/scripts/build-repo-map.sh artifacts/contract/abi.json "" "" +``` +This creates `artifacts/repo-map.md` with the Contract Layer populated from the ABI. Frontend and Backend layers will be populated after those agents complete. All domain agents reference this map for cross-layer awareness. 
+ #### Issue Check: Post-Contract (CONDITIONAL) **Only runs when `components.count >= 2` (multi-component build). Single-component builds skip this.** @@ -765,6 +796,13 @@ Launch `cross-layer-validator` agent: This step catches ABI mismatches, wrong method names, parameter type errors, contract address inconsistencies, and network config conflicts BEFORE the auditor runs — saving entire audit cycles. +**Regenerate Repo Map (after builders complete):** +After all builders and cross-layer validation are done, regenerate the repo map with all layers populated: +```bash +bash ${CLAUDE_PLUGIN_ROOT}/scripts/build-repo-map.sh artifacts/contract/abi.json "[WORKTREE]/frontend" "[WORKTREE]/backend" +``` +The updated `artifacts/repo-map.md` now has Contract, Frontend, and Backend layers plus cross-layer integrity checks. The auditor and reviewer reference this map for integration context. + #### Step 2c: Security Audit **Incremental Audit (cycle >= 2):** If this is cycle 2 or later, construct the auditor prompt with incremental context: @@ -983,6 +1021,21 @@ If the spec has fewer than 5 tasks and touches a single domain, fall back to the Update state: `current_phase: review`, `status: reviewing` +### Mutation Gate (before reviewer dispatch) + +If a contract was built in this session (`components.contract = true`), run mutation testing before the reviewer: + +```bash +bash ${CLAUDE_PLUGIN_ROOT}/scripts/mutate-contract.sh <contract-src-path> <test-dir> +``` + +Read `artifacts/testing/mutation-score.json` and check the verdict: +- If `mutation_score < 0.70` (verdict: FAIL): DO NOT dispatch the reviewer. Instead, route back to `opnet-contract-dev` with the survivors list. Include: "Mutation testing failed: {killed}/{total} mutants killed (score: {mutation_score}). These mutations survived — your tests do not cover them: {survivors}. Add tests to kill these mutants before proceeding." +- If `mutation_score >= 0.70` (verdict: PASS): proceed to reviewer dispatch. 
+- If mutation-score.json does not exist or has errors: log a warning and proceed (do not block on mutation infrastructure issues). + +### Reviewer Dispatch + Launch the `loop-reviewer` agent. Give it: 1. The three spec documents. @@ -1025,6 +1078,7 @@ Save the review output to `.claude/loop/sessions//reviews/cycle-.md`. - For OPNet projects: candidate agents are `opnet-contract-dev,opnet-frontend-dev,opnet-backend-dev`. - For generic projects: candidate agents are derived from the agents that were dispatched in this session. - After routing, write a categorized findings file to `artifacts/findings-categorized.md` for use by `update-scores.sh --findings` during wrap-up. Format per finding: `agent: | category: | outcome: pending` + - **Structured Repair (v7.0):** When routing findings to agents, use the R1/R2/R3 repair phases instead of raw re-dispatch. Run `localize-failure.sh` on the failure context first (Phase R1), then dispatch the agent with localized context only (Phase R2), then validate candidate patches (Phase R3). This replaces "re-run agent with failure context" for more targeted repairs. - If no (max cycles reached): update state to `failed`. Generate failure diagnosis and print remaining findings with the PR URL. The human takes over. ### Findings Ledger @@ -1049,6 +1103,30 @@ After Phase 5 review completes (every cycle), parse the reviewer's findings and 5. **3-cycle archiving rule**: For findings where `(current_cycle - cycle_found) > 3`, move them to an "Archived Findings" section at the bottom of the ledger. Archived findings are not checked for regression — they are historical records only. 6. For cycle 2+ reviewer dispatch: include the ledger in the prompt with instruction: "Check all RESOLVED findings for regression. Mark regressions as CRITICAL with [REGRESSION] tag." 
+### Goal-Oriented Build Scoring + +After each review cycle (pass or fail), run the build scoring script: + +```bash +bash ${CLAUDE_PLUGIN_ROOT}/scripts/score-build.sh +``` + +This evaluates 4 dimensions and writes `artifacts/evaluation/progress-tracker.yaml`: + +| Dimension | Threshold | Route on Fail | +|-----------|-----------|---------------| +| spec_coverage | >= 90% | loop-reviewer (spec gaps) | +| security_delta | <= 0 | opnet-auditor (open findings) | +| mutation_score | >= 70% | opnet-contract-dev (untested paths) | +| code_health | >= 60% | responsible builder (quality issues) | + +Display the compact score table in the review summary. ALL thresholds must be met for the build to be considered complete. Failed dimensions route to the responsible agent with specific remediation context. + +If `spec-requirements.yaml` does not exist yet, run `extract-requirements.sh` first: +```bash +bash ${CLAUDE_PLUGIN_ROOT}/scripts/extract-requirements.sh <requirements-md-path> +``` + ### Structured Failure Diagnosis When `cycle >= max_cycles` and the verdict is FAIL, generate `artifacts/failure-diagnosis.md`: diff --git a/scripts/build-repo-map.sh b/scripts/build-repo-map.sh new file mode 100755 index 0000000..d9335e5 --- /dev/null +++ b/scripts/build-repo-map.sh @@ -0,0 +1,315 @@ +#!/bin/bash +# build-repo-map.sh — Hierarchical cross-layer repository map generator +# +# Usage: bash scripts/build-repo-map.sh [abi-json-path] [frontend-dir] [backend-dir] +# +# Generates artifacts/repo-map.md with sections: +# - Contract Layer (from abi.json): class, methods with signatures, storage slots, events +# - Frontend Layer (populated after frontend-dev) +# - Backend Layer (populated after backend-dev) +# - Cross-Layer Integrity Checks (auto-generated: missing methods, extra calls) +# +# Target: < 300 lines +# +# Exit codes: +# 0 — Success +# 1 — Error during generation + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" +ABI_PATH="${1:-}" +FRONTEND_DIR="${2:-}" +BACKEND_DIR="${3:-}" + +OUTPUT_DIR="${SCRIPT_DIR}/artifacts" +mkdir -p "$OUTPUT_DIR" + +export _REPOMAP_ABI="$ABI_PATH" +export _REPOMAP_FRONTEND="$FRONTEND_DIR" +export _REPOMAP_BACKEND="$BACKEND_DIR" +export _REPOMAP_OUTPUT="$OUTPUT_DIR/repo-map.md" +python3 << 'PYEOF' +import sys +import json +import os +import re +import glob + +abi_path = os.environ.get('_REPOMAP_ABI', '') +frontend_dir = os.environ.get('_REPOMAP_FRONTEND', '') +backend_dir = os.environ.get('_REPOMAP_BACKEND', '') +output_path = os.environ['_REPOMAP_OUTPUT'] + +lines = [] +contract_methods = [] +frontend_calls = [] +backend_calls = [] + +# --- Contract Layer --- +lines.append("# Repository Map") +lines.append("") +lines.append("## Contract Layer") +lines.append("") + +if abi_path and os.path.exists(abi_path): + try: + with open(abi_path, 'r') as f: + abi_data = json.load(f) + + # Handle both array format and object format + abi_entries = abi_data if isinstance(abi_data, list) else abi_data.get('abi', abi_data.get('methods', [])) + + methods_section = [] + events_section = [] + storage_section = [] + + if isinstance(abi_entries, list): + for entry in abi_entries: + if not isinstance(entry, dict): + continue + + entry_type = entry.get('type', 'function') + name = entry.get('name', 'unknown') + + if entry_type in ('function', 'method'): + inputs = entry.get('inputs', []) + outputs = entry.get('outputs', []) + input_sig = ", ".join( + "{}: {}".format(inp.get('name', 'arg'), inp.get('type', 'unknown')) + for inp in inputs + ) if isinstance(inputs, list) else "" + output_sig = ", ".join( + "{}".format(out.get('type', 'unknown')) + for out in outputs + ) if isinstance(outputs, list) else "" + + method_line = "- `{}({})` -> `{}`".format(name, input_sig, output_sig if output_sig else "void") + methods_section.append(method_line) + contract_methods.append(name) + + elif entry_type == 'event': + params = entry.get('inputs', entry.get('params', [])) + 
param_sig = ", ".join( + "{}: {}".format(p.get('name', 'arg'), p.get('type', 'unknown')) + for p in params + ) if isinstance(params, list) else "" + events_section.append("- `{}`({})".format(name, param_sig)) + + elif entry_type == 'storage': + slot = entry.get('slot', 'unknown') + storage_section.append("- `{}` (slot: {})".format(name, slot)) + + if methods_section: + lines.append("### Methods") + lines.append("") + lines.extend(methods_section) + lines.append("") + + if events_section: + lines.append("### Events") + lines.append("") + lines.extend(events_section) + lines.append("") + + if storage_section: + lines.append("### Storage Slots") + lines.append("") + lines.extend(storage_section) + lines.append("") + + if not methods_section and not events_section and not storage_section: + lines.append("*ABI parsed but no recognized entries found.*") + lines.append("") + + except (json.JSONDecodeError, IOError): + lines.append("*ABI file exists but could not be parsed.*") + lines.append("") +else: + lines.append("*No ABI file available. 
Run contract-dev first.*") + lines.append("") + +# --- Frontend Layer --- +lines.append("## Frontend Layer") +lines.append("") + +if frontend_dir and os.path.isdir(frontend_dir): + # Scan for contract method calls + src_dir = os.path.join(frontend_dir, "src") + search_dir = src_dir if os.path.isdir(src_dir) else frontend_dir + + ts_files = [] + for root, dirs, files in os.walk(search_dir): + for f in files: + if f.endswith(('.ts', '.tsx', '.js', '.jsx')): + ts_files.append(os.path.join(root, f)) + + components = [] + hooks = [] + services = [] + + for fpath in ts_files: + fname = os.path.basename(fpath) + rel_path = os.path.relpath(fpath, frontend_dir) + + if 'component' in rel_path.lower() or fname.endswith(('.tsx', '.jsx')): + components.append(rel_path) + elif 'hook' in rel_path.lower() or fname.startswith('use'): + hooks.append(rel_path) + elif 'service' in rel_path.lower() or 'api' in rel_path.lower(): + services.append(rel_path) + + # Scan for contract method calls + try: + with open(fpath, 'r') as f: + content = f.read() + # Match patterns like contract.methodName( or .methodName( + for match in re.finditer(r'\.(\w+)\s*\(', content): + call_name = match.group(1) + if call_name in contract_methods: + frontend_calls.append(call_name) + except (IOError, UnicodeDecodeError): + pass + + if components: + lines.append("### Components") + for c in components[:20]: + lines.append("- `{}`".format(c)) + lines.append("") + + if hooks: + lines.append("### Hooks") + for h in hooks[:10]: + lines.append("- `{}`".format(h)) + lines.append("") + + if services: + lines.append("### Services") + for s in services[:10]: + lines.append("- `{}`".format(s)) + lines.append("") + + if frontend_calls: + lines.append("### Contract Calls") + for call in sorted(set(frontend_calls)): + lines.append("- `{}`".format(call)) + lines.append("") + + if not components and not hooks and not services: + lines.append("*Frontend directory exists but no recognized source files found.*") + 
lines.append("") +else: + lines.append("*Not yet populated. Run frontend-dev first.*") + lines.append("") + +# --- Backend Layer --- +lines.append("## Backend Layer") +lines.append("") + +if backend_dir and os.path.isdir(backend_dir): + src_dir = os.path.join(backend_dir, "src") + search_dir = src_dir if os.path.isdir(src_dir) else backend_dir + + ts_files = [] + for root, dirs, files in os.walk(search_dir): + for f in files: + if f.endswith(('.ts', '.js')): + ts_files.append(os.path.join(root, f)) + + routes = [] + services = [] + + for fpath in ts_files: + fname = os.path.basename(fpath) + rel_path = os.path.relpath(fpath, backend_dir) + + if 'route' in rel_path.lower(): + routes.append(rel_path) + elif 'service' in rel_path.lower(): + services.append(rel_path) + + # Scan for contract method calls + try: + with open(fpath, 'r') as f: + content = f.read() + for match in re.finditer(r'\.(\w+)\s*\(', content): + call_name = match.group(1) + if call_name in contract_methods: + backend_calls.append(call_name) + except (IOError, UnicodeDecodeError): + pass + + if routes: + lines.append("### Routes") + for r in routes[:20]: + lines.append("- `{}`".format(r)) + lines.append("") + + if services: + lines.append("### Services") + for s in services[:10]: + lines.append("- `{}`".format(s)) + lines.append("") + + if backend_calls: + lines.append("### Contract Calls") + for call in sorted(set(backend_calls)): + lines.append("- `{}`".format(call)) + lines.append("") + + if not routes and not services: + lines.append("*Backend directory exists but no recognized source files found.*") + lines.append("") +else: + lines.append("*Not yet populated. 
Run backend-dev first.*") + lines.append("") + +# --- Cross-Layer Integrity Checks --- +lines.append("## Cross-Layer Integrity Checks") +lines.append("") + +all_calls = set(frontend_calls + backend_calls) +contract_set = set(contract_methods) + +if contract_methods: + # Missing methods: called but not in ABI. NOTE(review): frontend_calls/backend_calls only ever receive names already present in contract_methods (see the scan loops above), so this set is always empty as written. + missing = sorted(all_calls - contract_set) + if missing: + lines.append("### Missing Methods (called but not in ABI)") + for m in missing: + callers = [] + if m in frontend_calls: + callers.append("frontend") + if m in backend_calls: + callers.append("backend") + lines.append("- `{}` (called by: {})".format(m, ", ".join(callers))) + lines.append("") + + # Uncalled methods: in ABI but never called (only reported when a frontend or backend dir was supplied) + uncalled = sorted(contract_set - all_calls) + if uncalled and (frontend_dir or backend_dir): + lines.append("### Uncalled Methods (in ABI but never called)") + for m in uncalled: + lines.append("- `{}`".format(m)) + lines.append("") + + if not missing and not uncalled: + lines.append("*All contract methods are called. No missing or extra calls detected.*") + lines.append("") +else: + lines.append("*No contract ABI available for integrity checks.*") + lines.append("") + +# Truncate to 300 lines (297 kept + 3 trailer entries) +if len(lines) > 300: + lines = lines[:297] + lines.append("") + lines.append("*... truncated to 300 lines ...*") + lines.append("") + +with open(output_path, 'w') as f: + f.write('\n'.join(lines)) + f.write('\n') + +print("Repo map written to {} ({} lines)".format(output_path, len(lines))) +PYEOF diff --git a/scripts/extract-requirements.sh b/scripts/extract-requirements.sh new file mode 100755 index 0000000..cf39437 --- /dev/null +++ b/scripts/extract-requirements.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# extract-requirements.sh — Extract structured requirements from requirements.md +# +# Usage: bash scripts/extract-requirements.sh <requirements-md-path> +# +# Parses a requirements.md file and extracts individual requirements into +# a structured YAML format for goal-oriented evaluation. 
+# +# Output: artifacts/evaluation/spec-requirements.yaml +# requirements: +# - id: REQ-1 +# description: "..." +# has_test: false +# priority: must +# +# Exit codes: +# 0 — Success +# 1 — Missing arguments or file not found + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +REQUIREMENTS_PATH="${1:-}" + +if [[ -z "$REQUIREMENTS_PATH" ]]; then + echo "Usage: bash scripts/extract-requirements.sh " + exit 1 +fi + +if [[ ! -f "$REQUIREMENTS_PATH" ]]; then + echo "Error: Requirements file not found: $REQUIREMENTS_PATH" + exit 1 +fi + +OUTPUT_DIR="${SCRIPT_DIR}/artifacts/evaluation" +mkdir -p "$OUTPUT_DIR" + +export _EXTREQ_INPUT="$REQUIREMENTS_PATH" +export _EXTREQ_OUTPUT="$OUTPUT_DIR/spec-requirements.yaml" +python3 << 'PYEOF' +import sys +import re +import os + +req_path = os.environ['_EXTREQ_INPUT'] +output_path = os.environ['_EXTREQ_OUTPUT'] + +try: + with open(req_path, 'r') as f: + content = f.read() +except (IOError, OSError): + content = "" + +requirements = [] +req_id = 0 + +if content.strip(): + lines = content.split('\n') + + for line in lines: + stripped = line.strip() + if not stripped: + continue + + # Match requirement patterns: + # - [ ] REQ-N: description + # - REQ-N: description + # - Numbered list: 1. 
description + # - Bullet: - description (that looks like a requirement) + # - **Must**: description + + req_match = re.match(r'^[-*]\s*\[[ x]\]\s*(REQ-\d+)[:\s]+(.+)', stripped) + if not req_match: + req_match = re.match(r'^(REQ-\d+)[:\s]+(.+)', stripped) + if not req_match: + req_match = re.match(r'^\d+\.\s+(.+)', stripped) + if req_match: + req_id += 1 + desc = req_match.group(1).strip() + req_name = "REQ-{}".format(req_id) + # Detect priority + priority = "must" + desc_lower = desc.lower() + if "should" in desc_lower or "nice to have" in desc_lower: + priority = "should" + elif "could" in desc_lower or "optional" in desc_lower: + priority = "could" + + requirements.append({ + 'id': req_name, + 'description': desc, + 'has_test': False, + 'priority': priority, + }) + continue + + if req_match and len(req_match.groups()) >= 2: + req_name = req_match.group(1) + desc = req_match.group(2).strip() + priority = "must" + desc_lower = desc.lower() + if "should" in desc_lower or "nice to have" in desc_lower: + priority = "should" + elif "could" in desc_lower or "optional" in desc_lower: + priority = "could" + + requirements.append({ + 'id': req_name, + 'description': desc, + 'has_test': False, + 'priority': priority, + }) + continue + + # Match bullet points that contain requirement-like language + bullet_match = re.match(r'^[-*]\s+\*\*(Must|Should|Could)\*\*[:\s]+(.+)', stripped) + if bullet_match: + req_id += 1 + priority = bullet_match.group(1).lower() + desc = bullet_match.group(2).strip() + requirements.append({ + 'id': "REQ-{}".format(req_id), + 'description': desc, + 'has_test': False, + 'priority': priority, + }) + +# Write YAML output manually (no PyYAML dependency) +with open(output_path, 'w') as f: + f.write("requirements:\n") + if not requirements: + f.write(" []\n") + else: + for req in requirements: + f.write(" - id: \"{}\"\n".format(req['id'])) + # Escape quotes in description + safe_desc = req['description'].replace('"', '\\"') + f.write(" description: 
\"{}\"\n".format(safe_desc)) + f.write(" has_test: {}\n".format(str(req['has_test']).lower())) + f.write(" priority: \"{}\"\n".format(req['priority'])) + +print("Extracted {} requirements to {}".format(len(requirements), output_path)) +PYEOF diff --git a/scripts/localize-failure.sh b/scripts/localize-failure.sh new file mode 100755 index 0000000..e21799f --- /dev/null +++ b/scripts/localize-failure.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# localize-failure.sh — Failure localization from build/test logs +# +# Usage: bash scripts/localize-failure.sh +# +# Parses a failure log and extracts structured localization data: +# - File and function where the failure occurred +# - Line range of the suspected cause +# - Confidence level (high/medium/low) +# - Failure category (compile_error, test_failure, runtime_error, type_error, lint_error) +# +# Output: artifacts/localization.json +# { file, function, line_range, suspected_cause, confidence, failure_category } +# +# Exit codes: +# 0 — Success (localization written) +# 1 — Missing arguments or file not found + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +FAILURE_LOG="${1:-}" + +if [[ -z "$FAILURE_LOG" ]]; then + echo "Usage: bash scripts/localize-failure.sh " + exit 1 +fi + +if [[ ! 
-f "$FAILURE_LOG" ]]; then + echo "Error: Failure log not found: $FAILURE_LOG" + exit 1 +fi + +OUTPUT_DIR="${SCRIPT_DIR}/artifacts" +mkdir -p "$OUTPUT_DIR" + +# Use Python to parse the failure log and extract localization +export _LOC_LOG_PATH="$FAILURE_LOG" +export _LOC_OUTPUT_PATH="$OUTPUT_DIR/localization.json" +python3 << 'PYEOF' +import sys +import json +import re +import os + +log_path = os.environ['_LOC_LOG_PATH'] +output_path = os.environ['_LOC_OUTPUT_PATH'] + +# Read the failure log +try: + with open(log_path, 'r') as f: + content = f.read() +except (IOError, OSError): + content = "" + +if not content.strip(): + # Empty log — write minimal localization + result = { + "file": "unknown", + "function": "unknown", + "line_range": [0, 0], + "suspected_cause": "Empty failure log — no information available", + "confidence": "low", + "failure_category": "unknown" + } + with open(output_path, 'w') as f: + json.dump(result, f, indent=2) + f.write('\n') + print(json.dumps(result, indent=2)) + sys.exit(0) + +# Patterns for file:line extraction +file_line_patterns = [ + # TypeScript/JavaScript errors: file.ts(line,col) or file.ts:line:col + r'([^\s:]+\.(?:ts|js|as))[:\(](\d+)', + # Rust-style: --> file.rs:line:col + r'-->\s+([^\s:]+):(\d+)', + # Generic: at file:line + r'at\s+([^\s:]+):(\d+)', + # Error in file:line + r'(?:Error|error|ERROR)\s+(?:in\s+)?([^\s:]+):(\d+)', +] + +# Patterns for function names +function_patterns = [ + r'(?:function|method|fn)\s+(\w+)', + r'(\w+)\s*\(', + r'at\s+(\w+)\s+\(', + r'in\s+(\w+)\s+at', +] + +# Failure category detection +category_keywords = { + 'compile_error': ['compile', 'compilation', 'syntax error', 'cannot find', 'build failed', + 'TS\d+', 'AS\d+', 'unexpected token', 'parse error'], + 'test_failure': ['test failed', 'assertion', 'expect', 'FAIL', 'test.*error', + 'AssertionError', 'should.*but', 'expected.*received'], + 'runtime_error': ['runtime', 'uncaught', 'ReferenceError', 'TypeError', + 'null pointer', 'segfault', 
'panic', 'abort'], + 'type_error': ['type.*mismatch', 'type.*error', 'cannot assign', 'incompatible', + 'TS2\d+', 'not assignable'], + 'lint_error': ['lint', 'eslint', 'warning.*unused', 'no-unused', 'prettier'], +} + +# Extract file and line +file_found = "unknown" +line_found = 0 +for pattern in file_line_patterns: + match = re.search(pattern, content) + if match: + file_found = match.group(1) + line_found = int(match.group(2)) + break + +# Extract function name +function_found = "unknown" +for pattern in function_patterns: + match = re.search(pattern, content) + if match: + candidate = match.group(1) + # Filter out common non-function matches + if candidate not in ('if', 'for', 'while', 'return', 'throw', 'new', 'import', 'from'): + function_found = candidate + break + +# Detect category +category = "unknown" +for cat, keywords in category_keywords.items(): + for kw in keywords: + if re.search(kw, content, re.IGNORECASE): + category = cat + break + if category != "unknown": + break + +# Extract suspected cause (first error-like line) +cause_patterns = [ + r'(?:error|Error|ERROR)[:\s]+(.+?)(?:\n|$)', + r'(?:FAIL|FAILED)[:\s]+(.+?)(?:\n|$)', + r'(?:assert|Assert)[:\s]+(.+?)(?:\n|$)', +] +suspected_cause = "Could not determine cause from log" +for pattern in cause_patterns: + match = re.search(pattern, content) + if match: + suspected_cause = match.group(1).strip()[:200] + break + +# Determine confidence +confidence = "low" +if file_found != "unknown" and line_found > 0: + confidence = "high" +elif file_found != "unknown" or category != "unknown": + confidence = "medium" + +# Compute line range (10 lines around the error) +line_start = max(1, line_found - 5) +line_end = line_found + 5 + +result = { + "file": file_found, + "function": function_found, + "line_range": [line_start, line_end], + "suspected_cause": suspected_cause, + "confidence": confidence, + "failure_category": category +} + +with open(output_path, 'w') as f: + json.dump(result, f, indent=2) + 
f.write('\n') + +print(json.dumps(result, indent=2)) +PYEOF diff --git a/scripts/mutate-contract.sh b/scripts/mutate-contract.sh new file mode 100755 index 0000000..7f32859 --- /dev/null +++ b/scripts/mutate-contract.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# mutate-contract.sh — Mutation testing for contract source files +# +# Usage: bash scripts/mutate-contract.sh +# +# Applies 20 sed-level mutation operators to the contract source, one at a time. +# For each mutant: creates a temp copy, applies the mutation, compiles, runs tests. +# If tests fail (mutant killed) or tests pass (mutant survived). +# Compile errors count as "survived" (untested code path). +# +# Output: artifacts/testing/mutation-score.json +# { total_mutants, killed, survived, compile_errors, +# mutation_score (0-1), threshold (0.70), verdict (PASS/FAIL), +# survivors[] } +# +# Exit codes: +# 0 — Success (output written regardless of verdict) +# 1 — Missing arguments or file not found + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +CONTRACT_SRC="${1:-}" +TEST_DIR="${2:-}" + +if [[ -z "$CONTRACT_SRC" || -z "$TEST_DIR" ]]; then + echo "Usage: bash scripts/mutate-contract.sh " + echo " contract-src-path: path to the contract source file (.ts)" + echo " test-dir: path to the test directory" + exit 1 +fi + +if [[ ! -f "$CONTRACT_SRC" ]]; then + echo "Error: Contract source file not found: $CONTRACT_SRC" + exit 1 +fi + +if [[ ! 
-d "$TEST_DIR" ]]; then + echo "Error: Test directory not found: $TEST_DIR" + exit 1 +fi + +# Output directory +OUTPUT_DIR="${SCRIPT_DIR}/artifacts/testing" +mkdir -p "$OUTPUT_DIR" + +# Working directory for mutants +MUTANT_DIR=$(mktemp -d) +cleanup() { + if [[ -n "${SOURCE_CONTENT:-}" && -n "${CONTRACT_SRC:-}" && -f "$CONTRACT_SRC" ]]; then + echo "$SOURCE_CONTENT" > "$CONTRACT_SRC" + fi + rm -rf "$MUTANT_DIR" +} +trap cleanup EXIT + +# Contract directory (for build context) +CONTRACT_DIR="$(dirname "$CONTRACT_SRC")" +CONTRACT_FILENAME="$(basename "$CONTRACT_SRC")" + +TOTAL=0 +KILLED=0 +SURVIVED=0 +COMPILE_ERRORS=0 +SURVIVORS_JSON="[]" + +# 20 mutation operators (sed patterns) +# Each entry: "name|sed-pattern" +OPERATORS=( + "arith-add-to-sub|s/SafeMath\.add/SafeMath.sub/g" + "arith-sub-to-add|s/SafeMath\.sub/SafeMath.add/g" + "arith-mul-to-div|s/SafeMath\.mul/SafeMath.div/g" + "arith-div-to-mul|s/SafeMath\.div/SafeMath.mul/g" + "compare-eq-to-neq|s/==/!=/g" + "compare-neq-to-eq|s/!=/==/g" + "compare-gt-to-lt|s/> /< /g" + "compare-lt-to-gt|s/< /> /g" + "compare-gte-to-lte|s/>=/<=/g" + "compare-lte-to-gte|s/<=/>=/g" + "bool-true-to-false|s/return true/return false/g" + "bool-false-to-true|s/return false/return true/g" + "logic-and-to-or|s/&&/||/g" + "logic-or-to-and|s/||/\&\&/g" + "negate-condition|s/if (/if (!/g" + "remove-revert|s/Revert(/\/\/ Revert(/g" + "zero-constant|s/u256\.One/u256.Zero/g" + "one-constant|s/u256\.Zero/u256.One/g" + "remove-event|s/this\.emitEvent/\/\/ this.emitEvent/g" + "swap-args|s/\(a, b\)/(b, a)/g" +) + +# Check if there are any test files +TEST_COUNT=$(find "$TEST_DIR" -name "*.test.*" -o -name "*.spec.*" 2>/dev/null | wc -l | tr -d ' ') +if [[ "$TEST_COUNT" -eq 0 ]]; then + TEST_COUNT=$(find "$TEST_DIR" -name "*.ts" -o -name "*.js" 2>/dev/null | wc -l | tr -d ' ') +fi + +# Read the source file +SOURCE_CONTENT=$(cat "$CONTRACT_SRC") + +for entry in "${OPERATORS[@]}"; do + OP_NAME="${entry%%|*}" + SED_PATTERN="${entry#*|}" + + # Apply 
mutation
+    MUTATED=$(echo "$SOURCE_CONTENT" | sed "$SED_PATTERN" 2>/dev/null || echo "$SOURCE_CONTENT")
+
+    # Skip if mutation had no effect
+    if [[ "$MUTATED" == "$SOURCE_CONTENT" ]]; then
+        continue
+    fi
+
+    TOTAL=$((TOTAL + 1))
+
+    # Write mutant (printf rather than echo: the payload is arbitrary source text)
+    MUTANT_FILE="${MUTANT_DIR}/${CONTRACT_FILENAME}"
+    printf '%s\n' "$MUTATED" > "$MUTANT_FILE"
+
+    # Try to compile from the directory that owns package.json (contract dir, else its parent). The build runs in a subshell so its cd never changes this script's working directory, and --if-present keeps a project without a "build" script from being misreported as a compile error.
+    BUILD_DIR=""
+    if [[ -f "${CONTRACT_DIR}/package.json" ]]; then
+        BUILD_DIR="${CONTRACT_DIR}"
+    elif [[ -f "${CONTRACT_DIR}/../package.json" ]]; then
+        BUILD_DIR="${CONTRACT_DIR}/.."
+    fi
+
+    COMPILE_OK=true
+    if [[ -n "$BUILD_DIR" ]]; then
+        if ! ( cp "$MUTANT_FILE" "$CONTRACT_SRC" && cd "$BUILD_DIR" && npm run build --if-present ) >/dev/null 2>&1; then
+            COMPILE_OK=false
+            COMPILE_ERRORS=$((COMPILE_ERRORS + 1))
+            SURVIVED=$((SURVIVED + 1))
+            SURVIVORS_JSON=$(python3 -c "
+import json, sys
+survivors = json.loads(sys.argv[1])
+survivors.append({
+    'operator': sys.argv[2],
+    'reason': 'compile_error',
+    'file': sys.argv[3]
+})
+print(json.dumps(survivors))
+" "$SURVIVORS_JSON" "$OP_NAME" "$CONTRACT_SRC")
+            # Restore original before moving on to the next operator
+            printf '%s\n' "$SOURCE_CONTENT" > "$CONTRACT_SRC"
+            continue
+        fi
+    fi
+
+    # Run tests from the directory that owns package.json (contract dir, else its parent), again in a subshell
+    TEST_RUN_DIR=""
+    if [[ -f "${CONTRACT_DIR}/package.json" ]]; then
+        TEST_RUN_DIR="${CONTRACT_DIR}"
+    elif [[ -f "${CONTRACT_DIR}/../package.json" ]]; then
+        TEST_RUN_DIR="${CONTRACT_DIR}/.."
+    else
+        # No package.json: nothing can execute the tests. Leave this empty so the no_test_runner branch records the mutant as survived; merely listing test files would say nothing about whether they catch the mutant.
+        TEST_RUN_DIR=""
+    fi
+
+    if [[ -n "$TEST_RUN_DIR" ]]; then
+        if ( cd "$TEST_RUN_DIR" && npm test ) >/dev/null 2>&1; then
+            # Tests passed — mutant survived (bad)
+            SURVIVED=$((SURVIVED + 1))
+            SURVIVORS_JSON=$(python3 -c "
+import json, sys
+survivors = json.loads(sys.argv[1])
+survivors.append({
+    'operator': sys.argv[2],
+    'reason': 'tests_passed',
+    'file': sys.argv[3]
+})
+print(json.dumps(survivors))
+" "$SURVIVORS_JSON" "$OP_NAME" "$CONTRACT_SRC")
+        else
+            # Tests failed — mutant killed (good)
+            KILLED=$((KILLED + 1))
+        fi
+    else
+        # No test runner available — count as survived
+        SURVIVED=$((SURVIVED + 1))
+        SURVIVORS_JSON=$(python3 -c "
+import json, sys
+survivors = json.loads(sys.argv[1])
+survivors.append({
+    'operator': sys.argv[2],
+    'reason': 'no_test_runner',
+    'file': sys.argv[3]
+})
+print(json.dumps(survivors))
+" "$SURVIVORS_JSON" "$OP_NAME" "$CONTRACT_SRC")
+    fi
+
+    # Restore original source
+    printf '%s\n' "$SOURCE_CONTENT" > "$CONTRACT_SRC"
+done
+
+# Calculate mutation score
+if [[ $TOTAL -eq 0 ]]; then
+    MUTATION_SCORE=0
+else
+    MUTATION_SCORE=$(python3 -c "print(round($KILLED / $TOTAL, 4))")
+fi
+
+# Determine verdict
+THRESHOLD="0.70"
+if python3 -c "exit(0 if $MUTATION_SCORE >= $THRESHOLD else 1)"; then
+    VERDICT="PASS"
+else
+    VERDICT="FAIL"
+fi
+
+# Write output
+python3 -c "
+import json, sys
+
+result = {
+    'total_mutants': int(sys.argv[1]),
+    'killed': int(sys.argv[2]),
+    'survived': int(sys.argv[3]),
+    'compile_errors': int(sys.argv[4]),
+    'mutation_score': float(sys.argv[5]),
+    'threshold': float(sys.argv[6]),
+    'verdict': sys.argv[7],
+    'survivors': json.loads(sys.argv[8])
+}
+
+with open(sys.argv[9], 'w') as f:
+    json.dump(result, f, indent=2)
+    f.write('\n')
+
+print(json.dumps(result, indent=2))
+" "$TOTAL" "$KILLED" "$SURVIVED" "$COMPILE_ERRORS" "$MUTATION_SCORE" "$THRESHOLD" "$VERDICT" "$SURVIVORS_JSON" "${OUTPUT_DIR}/mutation-score.json"
+
+echo ""
+echo "Mutation testing complete: $KILLED/$TOTAL killed (score: $MUTATION_SCORE, verdict: $VERDICT)"
diff --git a/scripts/score-build.sh b/scripts/score-build.sh
new file mode 100755
index 0000000..1e605db
--- /dev/null
+++ b/scripts/score-build.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# score-build.sh — Goal-oriented build evaluation across 4 dimensions
+#
+# Usage: bash scripts/score-build.sh
+#
+# Evaluates the current build across 4 dimensions:
+#   1. spec_coverage (0-100%): requirements with tests / total requirements
+#   2. security_delta (integer): new open findings count (0 = no regression)
+#   3. mutation_score (0-100%): from mutation-score.json
+#   4. code_health (0-100%): 100 - (weighted_penalties * 5), floor 0
+#
+# Percentages are rounded DOWN to whole numbers before the threshold check,
+# so a value just under a threshold (89.5% spec coverage, a 0.6957 mutation
+# score) can never be rounded up into a pass.
+#
+# Thresholds: spec >= 90%, security <= 0, mutation >= 70%, health >= 60%
+#
+# Output: artifacts/evaluation/progress-tracker.yaml
+#
+# Exit codes:
+#   0 — Success (the score card was written; its verdict may still be FAIL)
+#   1 — Error during evaluation
+
+set -euo pipefail
+
+# NOTE: despite the name, this resolves to the repository root (parent of scripts/).
+SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+
+OUTPUT_DIR="${SCRIPT_DIR}/artifacts/evaluation"
+mkdir -p "$OUTPUT_DIR"
+
+# The heredoc below is quoted ('PYEOF'), so the shell expands nothing inside it;
+# the two paths are passed to Python through the environment instead.
+export _SCOREBUILD_ROOT="$SCRIPT_DIR"
+export _SCOREBUILD_OUTPUT="$OUTPUT_DIR/progress-tracker.yaml"
+python3 << 'PYEOF'
+import json
+import os
+import re
+
+script_dir = os.environ['_SCOREBUILD_ROOT']
+output_path = os.environ['_SCOREBUILD_OUTPUT']
+
+
+def read_text(path):
+    """Return the text of `path`, or None if it is missing or unreadable.
+
+    Decoding is pinned to UTF-8 with replacement so the result does not depend
+    on the caller's locale and a stray byte cannot abort the evaluation.
+    """
+    try:
+        with open(path, 'r', encoding='utf-8', errors='replace') as f:
+            return f.read()
+    except OSError:
+        return None
+
+
+def load_json_object(path):
+    """Parse `path` as a JSON object.
+
+    Raises ValueError when the file is unreadable, is not valid JSON, or its
+    top level is not an object, so callers can rely on .get().
+    """
+    text = read_text(path)
+    if text is None:
+        raise ValueError('unreadable: {}'.format(path))
+    data = json.loads(text)  # json.JSONDecodeError subclasses ValueError
+    if not isinstance(data, dict):
+        raise ValueError('expected a JSON object: {}'.format(path))
+    return data
+
+
+# --- Dimension 1: spec_coverage ---
+spec_coverage = 0
+total_reqs = 0
+tested_reqs = 0
+
+spec_req_path = os.path.join(script_dir, "artifacts", "evaluation", "spec-requirements.yaml")
+spec_text = read_text(spec_req_path)
+if spec_text is not None:
+    # One requirement per "- id:" list item at any indentation, column 0 included.
+    total_reqs = len(re.findall(r'^[ \t]*-[ \t]+id:', spec_text, re.MULTILINE))
+    # Cap at total_reqs so a stray has_test line cannot push coverage above 100%.
+    tested_reqs = min(len(re.findall(r'has_test:\s*true', spec_text)), total_reqs)
+    if total_reqs > 0:
+        spec_coverage = (tested_reqs * 100) // total_reqs
+
+# --- Dimension 2: security_delta ---
+security_delta = 0
+
+findings_path = os.path.join(script_dir, "artifacts", "findings-ledger.md")
+findings_text = read_text(findings_path)
+if findings_text is not None:
+    # Counts table cells whose whole content is OPEN.
+    # NOTE(review): REGRESSION rows are not counted here although code_health
+    # below treats them as unresolved — confirm that is intended.
+    security_delta = len(re.findall(r'\|\s*OPEN\s*\|', findings_text))
+
+# --- Dimension 3: mutation_score ---
+mutation_pct = 0
+
+mutation_path = os.path.join(script_dir, "artifacts", "testing", "mutation-score.json")
+if os.path.exists(mutation_path):
+    try:
+        ratio = float(load_json_object(mutation_path).get('mutation_score', 0))
+        # Round down; the epsilon only absorbs float noise such as
+        # 0.57 * 100 == 56.99999999999999. Clamp to the documented 0-100 range.
+        mutation_pct = max(0, min(100, int(ratio * 100 + 1e-9)))
+    except (ValueError, TypeError, OverflowError):
+        # Malformed file, non-numeric value, NaN or infinity: score it as 0.
+        mutation_pct = 0
+
+# --- Dimension 4: code_health ---
+# Penalties: +5 per build whose status is not 'success', +1 per build warning,
+# +2 per unparseable build-result.json, +1 per OPEN/REGRESSION ledger cell.
+weighted_penalties = 0
+
+for layer in ("contract", "frontend", "backend"):
+    build_result_path = os.path.join(script_dir, "artifacts", layer, "build-result.json")
+    if not os.path.exists(build_result_path):
+        continue
+    try:
+        build_data = load_json_object(build_result_path)
+    except ValueError:
+        weighted_penalties += 2
+        continue
+    if build_data.get('status') != 'success':
+        weighted_penalties += 5
+    warnings = build_data.get('warnings', 0)
+    # A negative count must not cancel out other penalties.
+    if isinstance(warnings, int) and warnings > 0:
+        weighted_penalties += warnings
+
+if findings_text is not None:
+    weighted_penalties += len(re.findall(r'\|\s*(?:OPEN|REGRESSION)\s*\|', findings_text))
+
+code_health = max(0, 100 - (weighted_penalties * 5))
+
+# --- Thresholds ---
+spec_threshold = 90
+security_threshold = 0
+mutation_threshold = 70
+health_threshold = 60
+
+spec_pass = spec_coverage >= spec_threshold
+security_pass = security_delta <= security_threshold
+mutation_pass = mutation_pct >= mutation_threshold
+health_pass = code_health >= health_threshold
+
+# One row per dimension:
+# (name, score, threshold, passed, YAML detail, table score, table threshold)
+dimensions = [
+    ("spec_coverage", spec_coverage, spec_threshold, spec_pass,
+     "{}/{} requirements with tests".format(tested_reqs, total_reqs),
+     "{}%".format(spec_coverage), ">={}%".format(spec_threshold)),
+    ("security_delta", security_delta, security_threshold, security_pass,
+     "{} open findings".format(security_delta),
+     str(security_delta), "<={}".format(security_threshold)),
+    ("mutation_score", mutation_pct, mutation_threshold, mutation_pass,
+     "{}% mutants killed".format(mutation_pct),
+     "{}%".format(mutation_pct), ">={}%".format(mutation_threshold)),
+    ("code_health", code_health, health_threshold, health_pass,
+     "{} weighted penalties".format(weighted_penalties),
+     "{}%".format(code_health), ">={}%".format(health_threshold)),
+]
+
+failed_dims = [row[0] for row in dimensions if not row[3]]
+overall_verdict = "FAIL" if failed_dims else "PASS"
+
+# Write YAML output
+with open(output_path, 'w', encoding='utf-8') as f:
+    f.write("dimensions:\n")
+    for name, score, threshold, passed, detail, _, _ in dimensions:
+        f.write("  {}:\n".format(name))
+        f.write("    score: {}\n".format(score))
+        f.write("    threshold: {}\n".format(threshold))
+        f.write("    pass: {}\n".format(str(passed).lower()))
+        f.write("    detail: \"{}\"\n".format(detail))
+    f.write("overall_verdict: \"{}\"\n".format(overall_verdict))
+    f.write("failed_dimensions:\n")
+    if failed_dims:
+        for dim in failed_dims:
+            f.write("  - \"{}\"\n".format(dim))
+    else:
+        f.write("  []\n")
+
+# Print compact table
+table_rule = "+" + "-"*20 + "+" + "-"*8 + "+" + "-"*10 + "+" + "-"*6 + "+"
+table_row = "| {:<18} | {:<6} | {:<8} | {:<4} |"
+print("Build Score Card")
+print(table_rule)
+print(table_row.format("Dimension", "Score", "Thresh", "Pass"))
+print(table_rule)
+for name, _, _, passed, _, shown_score, shown_threshold in dimensions:
+    print(table_row.format(name, shown_score, shown_threshold, "Y" if passed else "N"))
+print(table_rule)
+print("Overall: {}".format(overall_verdict))
+if failed_dims:
+    print("Failed: {}".format(", ".join(failed_dims)))
+PYEOF
diff --git a/tests/plugin-tests.sh b/tests/plugin-tests.sh
index eee6256..743e441 100755
--- a/tests/plugin-tests.sh
+++ b/tests/plugin-tests.sh
@@ -1810,30 +1810,30 @@ fi
 rm -rf "$QUERY_TMPDIR"
 
 # ─────────────────────────────────────────────────
-# Version 6.0.0 Consistency (TEST-13)
+# Version 7.0.0 Consistency
 # ─────────────────────────────────────────────────
 echo ""
-echo "=== Version 6.0.0 ==="
+echo "=== Version 7.0.0 ==="
 
-V6_PLUGIN=$(python3 -c "import json; print(json.load(open('.claude-plugin/plugin.json'))['version'])" 2>/dev/null)
-V6_CHANGELOG=$(head -5 CHANGELOG.md | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1 || true)
+V7_PLUGIN=$(python3 -c "import json; print(json.load(open('.claude-plugin/plugin.json'))['version'])" 2>/dev/null)
+V7_CHANGELOG=$(head -5 CHANGELOG.md | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1 || true)
 
-if [[ "$V6_PLUGIN" == "6.0.0" ]]; then
-    pass "v6-plugin-json-version: plugin.json version is 6.0.0"
+if [[ "$V7_PLUGIN" == "7.0.0" ]]; then
+    pass "v7-plugin-json-version: plugin.json version is 7.0.0"
 else
-    fail "v6-plugin-json-version: plugin.json version is NOT 6.0.0 (got: $V6_PLUGIN)"
+    fail "v7-plugin-json-version: plugin.json version is NOT 7.0.0 (got: $V7_PLUGIN)"
 fi
 
-if [[ "$V6_CHANGELOG" == "6.0.0" ]]; then
-    pass "v6-changelog-first-entry: CHANGELOG first entry is 6.0.0"
+if [[ "$V7_CHANGELOG" == "7.0.0" ]]; then
+    pass "v7-changelog-first-entry: CHANGELOG first entry is 7.0.0"
 else
-    fail "v6-changelog-first-entry: CHANGELOG first entry is NOT 6.0.0 (got: $V6_CHANGELOG)"
+    fail "v7-changelog-first-entry: CHANGELOG first entry is NOT 7.0.0 (got: $V7_CHANGELOG)"
 fi
 
-if [[ "$V6_PLUGIN" == "$V6_CHANGELOG" ]]; then
-    pass "v6-version-consistency: plugin.json version matches CHANGELOG first entry"
+if [[ "$V7_PLUGIN" == "$V7_CHANGELOG" ]]; then
+    pass "v7-version-consistency: plugin.json version matches CHANGELOG first entry"
 else
-    fail "v6-version-consistency: plugin.json ($V6_PLUGIN) does 
NOT match CHANGELOG ($V6_CHANGELOG)" + fail "v7-version-consistency: plugin.json ($V7_PLUGIN) does NOT match CHANGELOG ($V7_CHANGELOG)" fi # ───────────────────────────────────────────────── @@ -2845,6 +2845,800 @@ fi cp "$STALE_PATTERNS_BACKUP" learning/patterns.yaml rm -rf "$STALE_TMPDIR" +# ───────────────────────────────────────────────── +# v7 TEST-1: Mutation Testing Script +# ───────────────────────────────────────────────── +echo "" +echo "=== Mutation Testing Script ===" + +if [[ -f scripts/mutate-contract.sh ]]; then + pass "v7-mutation-script-exists: scripts/mutate-contract.sh exists" +else + fail "v7-mutation-script-exists: scripts/mutate-contract.sh NOT found" +fi + +if [[ -x scripts/mutate-contract.sh ]]; then + pass "v7-mutation-script-executable: scripts/mutate-contract.sh is executable" +else + fail "v7-mutation-script-executable: scripts/mutate-contract.sh is NOT executable" +fi + +check "v7-mutation-script-syntax: mutate-contract.sh passes bash -n" bash -n scripts/mutate-contract.sh + +if head -1 scripts/mutate-contract.sh | grep -q '#!/bin/bash'; then + pass "v7-mutation-script-shebang: mutate-contract.sh has bash shebang" +else + fail "v7-mutation-script-shebang: mutate-contract.sh missing bash shebang" +fi + +if grep -q 'set -euo pipefail' scripts/mutate-contract.sh; then + pass "v7-mutation-script-pipefail: mutate-contract.sh has set -euo pipefail" +else + fail "v7-mutation-script-pipefail: mutate-contract.sh missing set -euo pipefail" +fi + +if grep -q 'SCRIPT_DIR=' scripts/mutate-contract.sh; then + pass "v7-mutation-script-dir: mutate-contract.sh has SCRIPT_DIR" +else + fail "v7-mutation-script-dir: mutate-contract.sh missing SCRIPT_DIR" +fi + +# Verify 20 mutation operators exist +OPERATOR_COUNT=$(grep -c 'arith-\|compare-\|bool-\|logic-\|negate-\|remove-\|zero-\|one-\|swap-' scripts/mutate-contract.sh || true) +if [[ "$OPERATOR_COUNT" -ge 20 ]]; then + pass "v7-mutation-20-operators: mutate-contract.sh has >= 20 mutation operators 
($OPERATOR_COUNT found)" +else + fail "v7-mutation-20-operators: mutate-contract.sh has < 20 mutation operators ($OPERATOR_COUNT found)" +fi + +if grep -q 'mutation-score.json' scripts/mutate-contract.sh; then + pass "v7-mutation-output-file: mutate-contract.sh outputs mutation-score.json" +else + fail "v7-mutation-output-file: mutate-contract.sh does NOT output mutation-score.json" +fi + +# ───────────────────────────────────────────────── +# v7 TEST-2: Mutation Script Functional Test +# ───────────────────────────────────────────────── +echo "" +echo "=== Mutation Script Functional ===" + +MUTATION_TMPDIR=$(mktemp -d) +mkdir -p "$MUTATION_TMPDIR/src" +mkdir -p "$MUTATION_TMPDIR/tests" +echo 'export class Token { }' > "$MUTATION_TMPDIR/src/contract.ts" + +# Run with empty test dir — should produce valid JSON with score 0 +MUTATION_OUT=$(bash scripts/mutate-contract.sh "$MUTATION_TMPDIR/src/contract.ts" "$MUTATION_TMPDIR/tests" 2>&1 || true) + +if [[ -f artifacts/testing/mutation-score.json ]]; then + pass "v7-mutation-func-creates-json: mutate-contract.sh creates mutation-score.json" + + # Check JSON structure using python3 + MUTATION_SCORE=$(python3 -c "import json; d=json.load(open('artifacts/testing/mutation-score.json')); print(d.get('mutation_score', 'MISSING'))" 2>/dev/null || echo "PARSE_ERROR") + MUTATION_VERDICT=$(python3 -c "import json; d=json.load(open('artifacts/testing/mutation-score.json')); print(d.get('verdict', 'MISSING'))" 2>/dev/null || echo "PARSE_ERROR") + MUTATION_THRESHOLD=$(python3 -c "import json; d=json.load(open('artifacts/testing/mutation-score.json')); print(type(d.get('threshold')).__name__)" 2>/dev/null || echo "PARSE_ERROR") + + if [[ "$MUTATION_SCORE" == "0" || "$MUTATION_SCORE" == "0.0" ]]; then + pass "v7-mutation-func-score-zero: mutation_score is 0 for empty test dir" + else + fail "v7-mutation-func-score-zero: mutation_score is NOT 0 (got: $MUTATION_SCORE)" + fi + + if [[ "$MUTATION_VERDICT" == "FAIL" ]]; then + pass 
"v7-mutation-func-verdict-fail: verdict is FAIL for score 0" + else + fail "v7-mutation-func-verdict-fail: verdict is NOT FAIL (got: $MUTATION_VERDICT)" + fi + + if [[ "$MUTATION_THRESHOLD" == "float" ]]; then + pass "v7-mutation-func-threshold-type: threshold is a number (float), not string" + else + fail "v7-mutation-func-threshold-type: threshold is NOT a number (got type: $MUTATION_THRESHOLD)" + fi +else + fail "v7-mutation-func-creates-json: mutation-score.json NOT created" + fail "v7-mutation-func-score-zero: (skipped — no JSON)" + fail "v7-mutation-func-verdict-fail: (skipped — no JSON)" + fail "v7-mutation-func-threshold-type: (skipped — no JSON)" +fi + +rm -rf "$MUTATION_TMPDIR" +rm -f artifacts/testing/mutation-score.json + +# ───────────────────────────────────────────────── +# v7 TEST-3: Localize Failure Script +# ───────────────────────────────────────────────── +echo "" +echo "=== Localize Failure Script ===" + +if [[ -f scripts/localize-failure.sh ]]; then + pass "v7-localize-script-exists: scripts/localize-failure.sh exists" +else + fail "v7-localize-script-exists: scripts/localize-failure.sh NOT found" +fi + +if [[ -x scripts/localize-failure.sh ]]; then + pass "v7-localize-script-executable: scripts/localize-failure.sh is executable" +else + fail "v7-localize-script-executable: scripts/localize-failure.sh is NOT executable" +fi + +check "v7-localize-script-syntax: localize-failure.sh passes bash -n" bash -n scripts/localize-failure.sh + +if grep -q 'localization.json' scripts/localize-failure.sh; then + pass "v7-localize-output-file: localize-failure.sh outputs localization.json" +else + fail "v7-localize-output-file: localize-failure.sh does NOT output localization.json" +fi + +# ───────────────────────────────────────────────── +# v7 TEST-4: Localize Failure Functional Test +# ───────────────────────────────────────────────── +echo "" +echo "=== Localize Failure Functional ===" + +LOCALIZE_TMPDIR=$(mktemp -d) + +# Test with a sample error log 
+cat > "$LOCALIZE_TMPDIR/failure.log" << 'ERRLOG' +Error in src/contracts/Token.ts:42 + TypeError: Cannot read property 'balance' of undefined + at transfer (src/contracts/Token.ts:42:15) + at runTest (tests/token.test.ts:18:5) +ERRLOG + +bash scripts/localize-failure.sh "$LOCALIZE_TMPDIR/failure.log" 2>&1 || true + +if [[ -f artifacts/localization.json ]]; then + pass "v7-localize-func-creates-json: localize-failure.sh creates localization.json" + + LOC_FILE=$(python3 -c "import json; d=json.load(open('artifacts/localization.json')); print(d.get('file', 'MISSING'))" 2>/dev/null || echo "PARSE_ERROR") + LOC_CONFIDENCE=$(python3 -c "import json; d=json.load(open('artifacts/localization.json')); print(d.get('confidence', 'MISSING'))" 2>/dev/null || echo "PARSE_ERROR") + LOC_CATEGORY=$(python3 -c "import json; d=json.load(open('artifacts/localization.json')); print(d.get('failure_category', 'MISSING'))" 2>/dev/null || echo "PARSE_ERROR") + + if [[ "$LOC_FILE" != "unknown" && "$LOC_FILE" != "MISSING" ]]; then + pass "v7-localize-func-finds-file: localization identifies file ($LOC_FILE)" + else + fail "v7-localize-func-finds-file: localization did NOT identify file (got: $LOC_FILE)" + fi + + if [[ "$LOC_CONFIDENCE" == "high" || "$LOC_CONFIDENCE" == "medium" ]]; then + pass "v7-localize-func-confidence: localization has good confidence ($LOC_CONFIDENCE)" + else + fail "v7-localize-func-confidence: localization has low confidence (got: $LOC_CONFIDENCE)" + fi + + if [[ "$LOC_CATEGORY" != "unknown" && "$LOC_CATEGORY" != "MISSING" ]]; then + pass "v7-localize-func-category: localization identifies category ($LOC_CATEGORY)" + else + fail "v7-localize-func-category: localization did NOT identify category (got: $LOC_CATEGORY)" + fi +else + fail "v7-localize-func-creates-json: localization.json NOT created" + fail "v7-localize-func-finds-file: (skipped — no JSON)" + fail "v7-localize-func-confidence: (skipped — no JSON)" + fail "v7-localize-func-category: (skipped — no JSON)" 
+fi + +# Test with empty log +echo "" > "$LOCALIZE_TMPDIR/empty.log" +bash scripts/localize-failure.sh "$LOCALIZE_TMPDIR/empty.log" 2>&1 || true + +if [[ -f artifacts/localization.json ]]; then + EMPTY_CONFIDENCE=$(python3 -c "import json; d=json.load(open('artifacts/localization.json')); print(d.get('confidence', 'MISSING'))" 2>/dev/null || echo "PARSE_ERROR") + if [[ "$EMPTY_CONFIDENCE" == "low" ]]; then + pass "v7-localize-func-empty-low-confidence: empty log produces low confidence" + else + fail "v7-localize-func-empty-low-confidence: empty log does NOT produce low confidence (got: $EMPTY_CONFIDENCE)" + fi +else + fail "v7-localize-func-empty-low-confidence: (skipped — no JSON)" +fi + +rm -rf "$LOCALIZE_TMPDIR" +rm -f artifacts/localization.json + +# ───────────────────────────────────────────────── +# v7 TEST-5: Extract Requirements Script +# ───────────────────────────────────────────────── +echo "" +echo "=== Extract Requirements Script ===" + +if [[ -f scripts/extract-requirements.sh ]]; then + pass "v7-extract-req-script-exists: scripts/extract-requirements.sh exists" +else + fail "v7-extract-req-script-exists: scripts/extract-requirements.sh NOT found" +fi + +if [[ -x scripts/extract-requirements.sh ]]; then + pass "v7-extract-req-script-executable: scripts/extract-requirements.sh is executable" +else + fail "v7-extract-req-script-executable: scripts/extract-requirements.sh is NOT executable" +fi + +check "v7-extract-req-script-syntax: extract-requirements.sh passes bash -n" bash -n scripts/extract-requirements.sh + +# ───────────────────────────────────────────────── +# v7 TEST-6: Extract Requirements Functional +# ───────────────────────────────────────────────── +echo "" +echo "=== Extract Requirements Functional ===" + +EXTRACT_TMPDIR=$(mktemp -d) +cat > "$EXTRACT_TMPDIR/requirements.md" << 'REQMD' +# Requirements + +1. Users can create tokens with a name and symbol +2. Token transfers should deduct from sender and credit receiver +3. 
Admin can mint new tokens +REQMD + +bash scripts/extract-requirements.sh "$EXTRACT_TMPDIR/requirements.md" 2>&1 || true + +if [[ -f artifacts/evaluation/spec-requirements.yaml ]]; then + pass "v7-extract-req-func-creates-yaml: extract-requirements.sh creates spec-requirements.yaml" + + REQ_COUNT=$(grep -c 'id:' artifacts/evaluation/spec-requirements.yaml || true) + if [[ "$REQ_COUNT" -ge 3 ]]; then + pass "v7-extract-req-func-count: extracted >= 3 requirements ($REQ_COUNT found)" + else + fail "v7-extract-req-func-count: extracted < 3 requirements ($REQ_COUNT found)" + fi + + if grep -q 'has_test:' artifacts/evaluation/spec-requirements.yaml; then + pass "v7-extract-req-func-has-test-field: spec-requirements.yaml has has_test field" + else + fail "v7-extract-req-func-has-test-field: spec-requirements.yaml missing has_test field" + fi + + if grep -q 'priority:' artifacts/evaluation/spec-requirements.yaml; then + pass "v7-extract-req-func-priority-field: spec-requirements.yaml has priority field" + else + fail "v7-extract-req-func-priority-field: spec-requirements.yaml missing priority field" + fi +else + fail "v7-extract-req-func-creates-yaml: spec-requirements.yaml NOT created" + fail "v7-extract-req-func-count: (skipped — no YAML)" + fail "v7-extract-req-func-has-test-field: (skipped — no YAML)" + fail "v7-extract-req-func-priority-field: (skipped — no YAML)" +fi + +rm -rf "$EXTRACT_TMPDIR" +rm -f artifacts/evaluation/spec-requirements.yaml + +# ───────────────────────────────────────────────── +# v7 TEST-7: Score Build Script +# ───────────────────────────────────────────────── +echo "" +echo "=== Score Build Script ===" + +if [[ -f scripts/score-build.sh ]]; then + pass "v7-score-build-script-exists: scripts/score-build.sh exists" +else + fail "v7-score-build-script-exists: scripts/score-build.sh NOT found" +fi + +if [[ -x scripts/score-build.sh ]]; then + pass "v7-score-build-script-executable: scripts/score-build.sh is executable" +else + fail 
"v7-score-build-script-executable: scripts/score-build.sh is NOT executable" +fi + +check "v7-score-build-script-syntax: score-build.sh passes bash -n" bash -n scripts/score-build.sh + +# ───────────────────────────────────────────────── +# v7 TEST-8: Score Build Functional +# ───────────────────────────────────────────────── +echo "" +echo "=== Score Build Functional ===" + +# Clean up any leftover artifacts +rm -rf artifacts/evaluation artifacts/testing artifacts/findings-ledger.md + +bash scripts/score-build.sh 2>&1 || true + +if [[ -f artifacts/evaluation/progress-tracker.yaml ]]; then + pass "v7-score-build-func-creates-yaml: score-build.sh creates progress-tracker.yaml" + + if grep -q 'spec_coverage:' artifacts/evaluation/progress-tracker.yaml; then + pass "v7-score-build-func-spec-dim: progress-tracker has spec_coverage dimension" + else + fail "v7-score-build-func-spec-dim: progress-tracker missing spec_coverage dimension" + fi + + if grep -q 'security_delta:' artifacts/evaluation/progress-tracker.yaml; then + pass "v7-score-build-func-security-dim: progress-tracker has security_delta dimension" + else + fail "v7-score-build-func-security-dim: progress-tracker missing security_delta dimension" + fi + + if grep -q 'mutation_score:' artifacts/evaluation/progress-tracker.yaml; then + pass "v7-score-build-func-mutation-dim: progress-tracker has mutation_score dimension" + else + fail "v7-score-build-func-mutation-dim: progress-tracker missing mutation_score dimension" + fi + + if grep -q 'code_health:' artifacts/evaluation/progress-tracker.yaml; then + pass "v7-score-build-func-health-dim: progress-tracker has code_health dimension" + else + fail "v7-score-build-func-health-dim: progress-tracker missing code_health dimension" + fi + + if grep -q 'overall_verdict:' artifacts/evaluation/progress-tracker.yaml; then + pass "v7-score-build-func-verdict: progress-tracker has overall_verdict" + else + fail "v7-score-build-func-verdict: progress-tracker missing 
overall_verdict" + fi + + if grep -q 'threshold:' artifacts/evaluation/progress-tracker.yaml; then + pass "v7-score-build-func-thresholds: progress-tracker has threshold entries" + else + fail "v7-score-build-func-thresholds: progress-tracker missing threshold entries" + fi +else + fail "v7-score-build-func-creates-yaml: progress-tracker.yaml NOT created" + fail "v7-score-build-func-spec-dim: (skipped — no YAML)" + fail "v7-score-build-func-security-dim: (skipped — no YAML)" + fail "v7-score-build-func-mutation-dim: (skipped — no YAML)" + fail "v7-score-build-func-health-dim: (skipped — no YAML)" + fail "v7-score-build-func-verdict: (skipped — no YAML)" + fail "v7-score-build-func-thresholds: (skipped — no YAML)" +fi + +rm -rf artifacts/evaluation + +# ───────────────────────────────────────────────── +# v7 TEST-9: Build Repo Map Script +# ───────────────────────────────────────────────── +echo "" +echo "=== Build Repo Map Script ===" + +if [[ -f scripts/build-repo-map.sh ]]; then + pass "v7-repo-map-script-exists: scripts/build-repo-map.sh exists" +else + fail "v7-repo-map-script-exists: scripts/build-repo-map.sh NOT found" +fi + +if [[ -x scripts/build-repo-map.sh ]]; then + pass "v7-repo-map-script-executable: scripts/build-repo-map.sh is executable" +else + fail "v7-repo-map-script-executable: scripts/build-repo-map.sh is NOT executable" +fi + +check "v7-repo-map-script-syntax: build-repo-map.sh passes bash -n" bash -n scripts/build-repo-map.sh + +# ───────────────────────────────────────────────── +# v7 TEST-10: Build Repo Map Functional +# ───────────────────────────────────────────────── +echo "" +echo "=== Build Repo Map Functional ===" + +REPOMAP_TMPDIR=$(mktemp -d) + +# Create a minimal ABI +cat > "$REPOMAP_TMPDIR/abi.json" << 'ABIJSON' +[ + {"type": "function", "name": "transfer", "inputs": [{"name": "to", "type": "address"}, {"name": "amount", "type": "u256"}], "outputs": [{"type": "bool"}]}, + {"type": "function", "name": "balanceOf", "inputs": 
[{"name": "owner", "type": "address"}], "outputs": [{"type": "u256"}]}, + {"type": "event", "name": "Transfer", "inputs": [{"name": "from", "type": "address"}, {"name": "to", "type": "address"}, {"name": "value", "type": "u256"}]} +] +ABIJSON + +rm -f artifacts/repo-map.md +bash scripts/build-repo-map.sh "$REPOMAP_TMPDIR/abi.json" "" "" 2>&1 || true + +if [[ -f artifacts/repo-map.md ]]; then + pass "v7-repo-map-func-creates-md: build-repo-map.sh creates repo-map.md" + + if grep -q 'Contract Layer' artifacts/repo-map.md; then + pass "v7-repo-map-func-contract-layer: repo-map.md has Contract Layer section" + else + fail "v7-repo-map-func-contract-layer: repo-map.md missing Contract Layer section" + fi + + if grep -q 'transfer' artifacts/repo-map.md; then + pass "v7-repo-map-func-method-listed: repo-map.md lists transfer method" + else + fail "v7-repo-map-func-method-listed: repo-map.md does NOT list transfer method" + fi + + if grep -q 'Frontend Layer' artifacts/repo-map.md; then + pass "v7-repo-map-func-frontend-layer: repo-map.md has Frontend Layer section" + else + fail "v7-repo-map-func-frontend-layer: repo-map.md missing Frontend Layer section" + fi + + if grep -q 'Backend Layer' artifacts/repo-map.md; then + pass "v7-repo-map-func-backend-layer: repo-map.md has Backend Layer section" + else + fail "v7-repo-map-func-backend-layer: repo-map.md missing Backend Layer section" + fi + + if grep -q 'Cross-Layer Integrity' artifacts/repo-map.md; then + pass "v7-repo-map-func-integrity: repo-map.md has Cross-Layer Integrity section" + else + fail "v7-repo-map-func-integrity: repo-map.md missing Cross-Layer Integrity section" + fi + + REPOMAP_LINES=$(wc -l < artifacts/repo-map.md | tr -d ' ') + if [[ "$REPOMAP_LINES" -lt 300 ]]; then + pass "v7-repo-map-func-under-300: repo-map.md is under 300 lines ($REPOMAP_LINES lines)" + else + fail "v7-repo-map-func-under-300: repo-map.md is >= 300 lines ($REPOMAP_LINES lines)" + fi +else + fail "v7-repo-map-func-creates-md: 
repo-map.md NOT created" + fail "v7-repo-map-func-contract-layer: (skipped — no file)" + fail "v7-repo-map-func-method-listed: (skipped — no file)" + fail "v7-repo-map-func-frontend-layer: (skipped — no file)" + fail "v7-repo-map-func-backend-layer: (skipped — no file)" + fail "v7-repo-map-func-integrity: (skipped — no file)" + fail "v7-repo-map-func-under-300: (skipped — no file)" +fi + +rm -rf "$REPOMAP_TMPDIR" +rm -f artifacts/repo-map.md + +# ───────────────────────────────────────────────── +# v7 TEST-11: Buidl Optimize Command +# ───────────────────────────────────────────────── +echo "" +echo "=== Buidl Optimize Command ===" + +if [[ -f commands/buidl-optimize.md ]]; then + pass "v7-optimize-cmd-exists: commands/buidl-optimize.md exists" +else + fail "v7-optimize-cmd-exists: commands/buidl-optimize.md NOT found" +fi + +# Check frontmatter +if head -5 commands/buidl-optimize.md | grep -q 'description:'; then + pass "v7-optimize-cmd-description: buidl-optimize.md has description in frontmatter" +else + fail "v7-optimize-cmd-description: buidl-optimize.md missing description in frontmatter" +fi + +if head -10 commands/buidl-optimize.md | grep -q 'argument-hint:'; then + pass "v7-optimize-cmd-argument-hint: buidl-optimize.md has argument-hint in frontmatter" +else + fail "v7-optimize-cmd-argument-hint: buidl-optimize.md missing argument-hint in frontmatter" +fi + +if head -10 commands/buidl-optimize.md | grep -q 'allowed-tools:'; then + pass "v7-optimize-cmd-allowed-tools: buidl-optimize.md has allowed-tools in frontmatter" +else + fail "v7-optimize-cmd-allowed-tools: buidl-optimize.md missing allowed-tools in frontmatter" +fi + +# Check FORBIDDEN section +if grep -q 'FORBIDDEN' commands/buidl-optimize.md; then + pass "v7-optimize-cmd-forbidden: buidl-optimize.md has FORBIDDEN section" +else + fail "v7-optimize-cmd-forbidden: buidl-optimize.md missing FORBIDDEN section" +fi + +# Check supported metrics +for metric in gas bundle_size test_time throughput; do + if 
grep -q "$metric" commands/buidl-optimize.md; then
+    pass "v7-optimize-cmd-metric-${metric}: buidl-optimize.md supports $metric metric"
+  else
+    fail "v7-optimize-cmd-metric-${metric}: buidl-optimize.md missing $metric metric"
+  fi
+done
+
+# Check max_cycles default (-w: 10 must be a whole word, so 100, 2010 or 0.10x do not satisfy it)
+if grep -qw '10' commands/buidl-optimize.md; then
+  pass "v7-optimize-cmd-max-cycles: buidl-optimize.md mentions default 10 max cycles"
+else
+  fail "v7-optimize-cmd-max-cycles: buidl-optimize.md missing default 10 max cycles"
+fi
+
+# Check output artifacts (-F: file names are matched literally, '.' is not a regex wildcard)
+if grep -qF 'summary.md' commands/buidl-optimize.md; then
+  pass "v7-optimize-cmd-output-summary: buidl-optimize.md references summary.md output"
+else
+  fail "v7-optimize-cmd-output-summary: buidl-optimize.md missing summary.md output reference"
+fi
+
+if grep -qF 'best-result.json' commands/buidl-optimize.md; then
+  pass "v7-optimize-cmd-output-best: buidl-optimize.md references best-result.json output"
+else
+  fail "v7-optimize-cmd-output-best: buidl-optimize.md missing best-result.json output reference"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-12: Mutation Gate in Orchestrator
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Mutation Gate in Orchestrator ==="
+
+if grep -qF 'mutate-contract.sh' commands/buidl.md; then
+  pass "v7-mutation-gate-in-buidl: buidl.md references mutate-contract.sh"
+else
+  fail "v7-mutation-gate-in-buidl: buidl.md does NOT reference mutate-contract.sh"
+fi
+
+if grep -q 'mutation_score' commands/buidl.md; then
+  pass "v7-mutation-gate-score-check: buidl.md checks mutation_score"
+else
+  fail "v7-mutation-gate-score-check: buidl.md does NOT check mutation_score"
+fi
+
+if grep -qF '0.70' commands/buidl.md; then  # -F: literal dot, so "0x70" or "0770" cannot match
+  pass "v7-mutation-gate-threshold: buidl.md has 0.70 threshold"
+else
+  fail "v7-mutation-gate-threshold: buidl.md missing 0.70 threshold"
+fi
+
+# Mutation gate should be in Phase 5 (REVIEW section): only lines from 'PHASE 5' through 'PHASE 6' are searched
+if sed -n '/PHASE 5/,/PHASE 6/p' commands/buidl.md | grep -qF 'mutate-contract.sh'; then
+  pass "v7-mutation-gate-phase5: mutation gate is in Phase 5 section"
+else
+  fail "v7-mutation-gate-phase5: mutation gate is NOT in Phase 5 section"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-13: Structured Repair Phases in Orchestrator
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Structured Repair Phases ==="
+
+if grep -q 'Phase R1' commands/buidl.md; then
+  pass "v7-repair-r1-in-buidl: buidl.md has Phase R1 (LOCALIZE)"
+else
+  fail "v7-repair-r1-in-buidl: buidl.md missing Phase R1"
+fi
+
+if grep -q 'Phase R2' commands/buidl.md; then
+  pass "v7-repair-r2-in-buidl: buidl.md has Phase R2 (PATCH)"
+else
+  fail "v7-repair-r2-in-buidl: buidl.md missing Phase R2"
+fi
+
+if grep -q 'Phase R3' commands/buidl.md; then
+  pass "v7-repair-r3-in-buidl: buidl.md has Phase R3 (VALIDATE)"
+else
+  fail "v7-repair-r3-in-buidl: buidl.md missing Phase R3"
+fi
+
+if grep -qF 'localize-failure.sh' commands/buidl.md; then
+  pass "v7-repair-localize-ref: buidl.md references localize-failure.sh"
+else
+  fail "v7-repair-localize-ref: buidl.md does NOT reference localize-failure.sh"
+fi
+
+if grep -qF 'localization.json' commands/buidl.md; then
+  pass "v7-repair-localization-json: buidl.md references localization.json output"
+else
+  fail "v7-repair-localization-json: buidl.md does NOT reference localization.json"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-14: Score Build in Orchestrator
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Score Build in Orchestrator ==="
+
+if grep -qF 'score-build.sh' commands/buidl.md; then
+  pass "v7-score-build-in-buidl: buidl.md references score-build.sh"
+else
+  fail "v7-score-build-in-buidl: buidl.md does NOT reference score-build.sh"
+fi
+
+if grep -qF 'progress-tracker.yaml' commands/buidl.md; then
+  pass "v7-score-build-tracker-ref: buidl.md references progress-tracker.yaml"
+else
+  fail "v7-score-build-tracker-ref: buidl.md does NOT reference progress-tracker.yaml"
+fi
+
+if grep -q 'spec_coverage' commands/buidl.md; then
+  pass "v7-score-build-spec-dim: buidl.md has spec_coverage dimension"
+else
+  fail "v7-score-build-spec-dim: buidl.md missing spec_coverage dimension"
+fi
+
+if grep -q 'security_delta' commands/buidl.md; then
+  pass "v7-score-build-security-dim: buidl.md has security_delta dimension"
+else
+  fail "v7-score-build-security-dim: buidl.md missing security_delta dimension"
+fi
+
+if grep -q 'code_health' commands/buidl.md; then
+  pass "v7-score-build-health-dim: buidl.md has code_health dimension"
+else
+  fail "v7-score-build-health-dim: buidl.md missing code_health dimension"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-15: Repo Map in Orchestrator
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Repo Map in Orchestrator ==="
+
+if grep -qF 'build-repo-map.sh' commands/buidl.md; then
+  pass "v7-repo-map-in-buidl: buidl.md references build-repo-map.sh"
+else
+  fail "v7-repo-map-in-buidl: buidl.md does NOT reference build-repo-map.sh"
+fi
+
+if grep -qF 'repo-map.md' commands/buidl.md; then
+  pass "v7-repo-map-artifact-ref: buidl.md references repo-map.md artifact"
+else
+  fail "v7-repo-map-artifact-ref: buidl.md does NOT reference repo-map.md artifact"
+fi
+
+# Check that repo map is generated after ABI lock (script must appear between the 'ABI Lock' and 'Issue Check' markers)
+if sed -n '/ABI Lock/,/Issue Check/p' commands/buidl.md | grep -qF 'build-repo-map.sh'; then
+  pass "v7-repo-map-after-abi-lock: repo map generated after ABI lock in buidl.md"
+else
+  fail "v7-repo-map-after-abi-lock: repo map NOT generated after ABI lock in buidl.md"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-16: Agent Repo Map References
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Agent Repo Map References ==="
+
+V7_AGENTS=(cross-layer-validator loop-builder loop-explorer loop-researcher loop-reviewer opnet-auditor opnet-backend-dev opnet-contract-dev opnet-deployer opnet-e2e-tester opnet-frontend-dev opnet-ui-tester)
+
+for agent in "${V7_AGENTS[@]}"; do
+  if grep -qF 'repo-map.md' "agents/${agent}.md"; then
+    pass "v7-agent-repomap-${agent}: ${agent}.md references repo-map.md"
+  else
+    fail "v7-agent-repomap-${agent}: ${agent}.md does NOT reference repo-map.md"
+  fi
+done
+
+# Verify adversarial agents do NOT have repo-map reference (a missing agent file also passes: grep's error is silenced)
+for agent in opnet-adversarial-auditor opnet-adversarial-tester; do
+  if grep -qF 'repo-map.md' "agents/${agent}.md" 2>/dev/null; then
+    fail "v7-agent-repomap-no-${agent}: ${agent}.md should NOT reference repo-map.md"
+  else
+    pass "v7-agent-repomap-no-${agent}: ${agent}.md correctly does NOT reference repo-map.md"
+  fi
+done
+
+# ─────────────────────────────────────────────────
+# v7 TEST-17: Localize Mode in Reviewer
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Localize Mode in Reviewer ==="
+
+if grep -q 'Localize Mode' agents/loop-reviewer.md; then
+  pass "v7-localize-mode-in-reviewer: loop-reviewer.md has Localize Mode section"
+else
+  fail "v7-localize-mode-in-reviewer: loop-reviewer.md missing Localize Mode section"
+fi
+
+if grep -Eq 'max_turns(.*[^0-9])?5([^0-9]|$)' agents/loop-reviewer.md; then  # standalone 5 only: rejects 15, 50, 25
+  pass "v7-localize-mode-max-turns: Localize Mode has max_turns 5"
+else
+  fail "v7-localize-mode-max-turns: Localize Mode missing max_turns 5"
+fi
+
+if grep -q 'READ-ONLY' agents/loop-reviewer.md; then
+  pass "v7-localize-mode-readonly: Localize Mode is READ-ONLY"
+else
+  fail "v7-localize-mode-readonly: Localize Mode missing READ-ONLY constraint"
+fi
+
+if grep -q 'FORBIDDEN in Localize Mode' agents/loop-reviewer.md; then
+  pass "v7-localize-mode-forbidden: Localize Mode has FORBIDDEN section"
+else
+  fail "v7-localize-mode-forbidden: Localize Mode missing FORBIDDEN section"
+fi
+
+# Localize Mode should appear AFTER Critique Mode (compares line numbers of each first occurrence; fails if either is absent)
+CRITIQUE_LINE=$(grep -n 'Critique Mode' agents/loop-reviewer.md | head -1 | cut -d: -f1)
+LOCALIZE_LINE=$(grep -n 'Localize Mode' agents/loop-reviewer.md | head -1 | cut -d: -f1)
+if [[ -n "$CRITIQUE_LINE" && -n "$LOCALIZE_LINE" && "$LOCALIZE_LINE" -gt "$CRITIQUE_LINE" ]]; then
+  pass "v7-localize-mode-after-critique: Localize Mode appears after Critique Mode"
+else
+  fail "v7-localize-mode-after-critique: Localize Mode does NOT appear after Critique Mode"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-18: Buidl Status Updates
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== Buidl Status Updates ==="
+
+if grep -q 'Mutation' commands/buidl-status.md; then
+  pass "v7-status-mutation: buidl-status.md shows mutation score"
+else
+  fail "v7-status-mutation: buidl-status.md does NOT show mutation score"
+fi
+
+if grep -qF 'mutation-score.json' commands/buidl-status.md; then
+  pass "v7-status-mutation-json: buidl-status.md references mutation-score.json"
+else
+  fail "v7-status-mutation-json: buidl-status.md does NOT reference mutation-score.json"
+fi
+
+if grep -q 'Build Score' commands/buidl-status.md; then
+  pass "v7-status-build-score: buidl-status.md shows build score card"
+else
+  fail "v7-status-build-score: buidl-status.md does NOT show build score card"
+fi
+
+if grep -qF 'progress-tracker.yaml' commands/buidl-status.md; then
+  pass "v7-status-tracker-ref: buidl-status.md references progress-tracker.yaml"
+else
+  fail "v7-status-tracker-ref: buidl-status.md does NOT reference progress-tracker.yaml"
+fi
+
+# Verify steps are properly renumbered (old step 7 is now step 9); the patterns match a numbered-list item at line start
+if grep -q '^9\. ' commands/buidl-status.md; then
+  pass "v7-status-renumbered-9: buidl-status.md has step 9"
+else
+  fail "v7-status-renumbered-9: buidl-status.md missing step 9"
+fi
+
+if grep -q '^12\. ' commands/buidl-status.md; then
+  pass "v7-status-renumbered-12: buidl-status.md has step 12"
+else
+  fail "v7-status-renumbered-12: buidl-status.md missing step 12"
+fi
+
+# ─────────────────────────────────────────────────
+# v7 TEST-19: CHANGELOG and README Updates
+# ─────────────────────────────────────────────────
+echo ""
+echo "=== CHANGELOG and README Updates ==="
+
+if grep -q '\[7\.0\.0\]' CHANGELOG.md; then
+  pass "v7-changelog-entry: CHANGELOG.md has [7.0.0] entry"
+else
+  fail "v7-changelog-entry: CHANGELOG.md missing [7.0.0] entry"
+fi
+
+# 7.0.0 should come before 6.0.0 (newest entry first; compares line numbers of each first occurrence)
+V7_LINE=$(grep -n '\[7\.0\.0\]' CHANGELOG.md | head -1 | cut -d: -f1)
+V6_LINE=$(grep -n '\[6\.0\.0\]' CHANGELOG.md | head -1 | cut -d: -f1)
+if [[ -n "$V7_LINE" && -n "$V6_LINE" && "$V7_LINE" -lt "$V6_LINE" ]]; then
+  pass "v7-changelog-order: [7.0.0] comes before [6.0.0] in CHANGELOG"
+else
+  fail "v7-changelog-order: [7.0.0] does NOT come before [6.0.0] in CHANGELOG"
+fi
+
+if grep -q 'buidl-optimize' README.md; then
+  pass "v7-readme-optimize-cmd: README.md mentions buidl-optimize command"
+else
+  fail "v7-readme-optimize-cmd: README.md does NOT mention buidl-optimize command"
+fi
+
+if grep -q 'v7\.0\.0' README.md; then
+  pass "v7-readme-version-history: README.md has v7.0.0 in version history"
+else
+  fail "v7-readme-version-history: README.md missing v7.0.0 in version history"
+fi
+
+if grep -q 'Mutation' README.md; then
+  pass "v7-readme-mutation-feature: README.md mentions mutation testing feature"
+else
+  fail "v7-readme-mutation-feature: README.md does NOT mention mutation testing feature"
+fi
+
+if grep -q 'Structured Repair' README.md; then
+  pass "v7-readme-repair-feature: README.md mentions structured repair feature"
+else
+  fail "v7-readme-repair-feature: README.md does NOT mention structured repair feature"
+fi
+
+if grep -q 'Goal-Oriented' README.md; then
+  pass "v7-readme-scoring-feature: README.md mentions goal-oriented evaluation feature"
+else
+  fail "v7-readme-scoring-feature: README.md does NOT mention goal-oriented evaluation feature"
+fi
+
+if grep -q 'Repo Map' README.md; then
+  pass "v7-readme-repomap-feature: README.md mentions repo map feature"
+else
+  fail "v7-readme-repomap-feature: README.md does NOT mention repo map feature"
+fi
+
 # ─────────────────────────────────────────────────
 # Summary
 # ─────────────────────────────────────────────────