From 40faa4d75c0009125f8fe8078f0a8a2768786c37 Mon Sep 17 00:00:00 2001 From: Serhan Date: Fri, 15 May 2026 14:25:34 -0700 Subject: [PATCH] fix(checkup): keep review loop running past cost telemetry --- README.md | 2 +- examples/checkup_review_loop_example.py | 3 +- pdd/agentic_checkup.py | 2 +- pdd/agentic_split_orchestrator.py | 5 +-- pdd/checkup_review_loop.py | 13 +++--- pdd/commands/checkup.py | 11 +++-- pdd/pdd_completion.fish | 2 +- pdd/pdd_completion.zsh | 2 +- pdd/prompts/agentic_checkup_python.prompt | 2 +- .../agentic_split_orchestrator_python.prompt | 2 +- pdd/prompts/checkup_review_loop_python.prompt | 6 +-- pdd/prompts/commands/checkup_python.prompt | 4 +- pdd/prompts/pdd_completion_fish.prompt | 2 +- pdd/prompts/pdd_completion_zsh.prompt | 2 +- tests/test_checkup_review_loop.py | 44 +++++++++++-------- 15 files changed, 53 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 14d3fd234..75d7239b1 100644 --- a/README.md +++ b/README.md @@ -2746,7 +2746,7 @@ Options: - `--reviewers ROLES`: Legacy comma-separated review-loop role order, interpreted as `reviewer,fixer` (default: `codex,claude`). - `--reviewer-fallback ROLE`: Optional secondary reviewer role to invoke once if the primary reviewer cannot complete (for example, because of auth, network, sandbox, or CLI failures). The fallback must resolve to a role different from the reviewer and fixer; if it succeeds, it becomes the active reviewer for the remaining loop and the superseded primary's row in the final report is annotated `(optional, superseded by )` so downstream verdict adapters drop the failed primary from the required-reviewer set and resolve to `ship_degraded` instead of `unknown`. - `--max-review-rounds INT`: Maximum primary-reviewer/fixer rounds (default: 5). -- `--max-review-cost FLOAT`: Maximum review-loop LLM cost in USD (default: 10.0). +- `--max-review-cost FLOAT`: Deprecated compatibility flag. Review-loop cost is reported as telemetry and does not stop the codex-review / claude-fix / codex-verify loop (default: 0.0). - `--max-review-minutes FLOAT`: Maximum review-loop wall-clock minutes (default: 90.0). - `--blocking-severities LIST`: Comma-separated severity names used for review-loop reporting and prompt guidance (default: `blocker,critical,medium`). The fixer still receives every valid reviewer finding. - `--continue-on-reviewer-limit`: Report provider, rate, context-window, timeout, auth, network, sandbox, permission, and non-zero-exit reviewer failures as `degraded` instead of `failed`. Degraded reviewers are still not clean unless a configured fallback reviewer completes successfully and takes over as the active reviewer. diff --git a/examples/checkup_review_loop_example.py b/examples/checkup_review_loop_example.py index 6bd8edfcf..12a9b3204 100644 --- a/examples/checkup_review_loop_example.py +++ b/examples/checkup_review_loop_example.py @@ -49,7 +49,6 @@ def main() -> None: config = ReviewLoopConfig( reviewers=parse_reviewers("codex,claude"), max_rounds=1, - max_cost=1.0, max_minutes=5.0, ) @@ -94,4 +93,4 @@ def main() -> None: console.print(f"Model: {model}") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/pdd/agentic_checkup.py b/pdd/agentic_checkup.py index 17d1b152b..d27981410 100644 --- a/pdd/agentic_checkup.py +++ b/pdd/agentic_checkup.py @@ -322,7 +322,7 @@ def run_agentic_checkup( fixer: Optional[str] = None, reviewer_fallback: Optional[str] = None, max_review_rounds: int = 5, - max_review_cost: float = 10.0, + max_review_cost: float = 0.0, max_review_minutes: float = 90.0, require_all_reviewers_clean: bool = True, continue_on_reviewer_limit: bool = False, diff --git a/pdd/agentic_split_orchestrator.py b/pdd/agentic_split_orchestrator.py index d97dae4f4..0e776e3c6 100644 --- a/pdd/agentic_split_orchestrator.py +++ b/pdd/agentic_split_orchestrator.py @@ -1404,9 +1404,8 @@ def _check_budget(at_step: str) -> Optional[Tuple[bool, str, float, str, List[st """Budget guard — return abort tuple if --max-cost crossed, else None. Persists state so the next run without --max-cost (or with a higher - one) resumes from ``at_step``. Mirrors the pattern in - ``pdd/checkup_review_loop.py:1849-1862`` (max_cost_reached flag + - stop_reason). + one) resumes from ``at_step`` with the max_cost_reached flag and + stop_reason recorded. MUST be called both at the top of each step iteration AND after every ``total_cost += ...`` accumulation that precedes a possible early diff --git a/pdd/checkup_review_loop.py b/pdd/checkup_review_loop.py index 03fd9a18c..9091ac338 100644 --- a/pdd/checkup_review_loop.py +++ b/pdd/checkup_review_loop.py @@ -582,7 +582,9 @@ class ReviewLoopConfig: reviewer_fallback: Optional[str] = None review_only: bool = False max_rounds: int = 5 - max_cost: float = 10.0 + # Deprecated compatibility field. Cost is reported but does not stop the + # implement/review loop; max_rounds and max_minutes bound normal execution. + max_cost: float = 0.0 max_minutes: float = 90.0 # Kept for CLI/API compatibility. The loop has one authoritative reviewer, # so this is no longer used as a ship gate. @@ -2998,11 +3000,11 @@ def _fix_dispute_note(fix: FixResult, finding: ReviewFinding) -> str: def _budget_exhausted( - config: ReviewLoopConfig, - state: ReviewLoopState, + _config: ReviewLoopConfig, + _state: ReviewLoopState, deadline: float, ) -> bool: - return state.total_cost >= config.max_cost or time.monotonic() >= deadline + return time.monotonic() >= deadline def _mark_budget_exhausted( @@ -3010,9 +3012,6 @@ def _mark_budget_exhausted( state: ReviewLoopState, deadline: float, ) -> None: - if state.total_cost >= config.max_cost: - state.max_cost_reached = True - state.stop_reason = f"Max review cost reached: ${config.max_cost:.2f}." if time.monotonic() >= deadline: state.max_duration_reached = True state.stop_reason = f"Max review duration reached: {config.max_minutes:g} minutes." diff --git a/pdd/commands/checkup.py b/pdd/commands/checkup.py index 6c1694dc0..5c045ffc3 100644 --- a/pdd/commands/checkup.py +++ b/pdd/commands/checkup.py @@ -129,9 +129,12 @@ @click.option( "--max-review-cost", type=float, - default=10.0, + default=0.0, show_default=True, - help="Maximum review-loop LLM cost in USD.", + help=( + "Deprecated compatibility flag. Review-loop cost is reported but does " + "not stop the implement/review loop." + ), ) @click.option( "--max-review-minutes", @@ -279,9 +282,9 @@ def checkup( "--max-review-rounds must be >= 1.", param_hint="'--max-review-rounds'", ) - if review_loop and max_review_cost <= 0: + if review_loop and max_review_cost < 0: raise click.BadParameter( - "--max-review-cost must be > 0.", + "--max-review-cost must be >= 0.", param_hint="'--max-review-cost'", ) if review_loop and max_review_minutes <= 0: diff --git a/pdd/pdd_completion.fish b/pdd/pdd_completion.fish index 01c915511..feaa00f73 100644 --- a/pdd/pdd_completion.fish +++ b/pdd/pdd_completion.fish @@ -149,7 +149,7 @@ complete -c pdd -n "__fish_seen_subcommand_from checkup" -l reviewer -x -d "Prim complete -c pdd -n "__fish_seen_subcommand_from checkup" -l fixer -x -d "Fixer role" complete -c pdd -n "__fish_seen_subcommand_from checkup" -l reviewer-fallback -x -d "Fallback reviewer when primary fails" complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-rounds -x -d "Max review rounds" -complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-cost -x -d "Max review cost (USD)" +complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-cost -x -d "Deprecated; review cost is report-only" complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-minutes -x -d "Max review wall-clock minutes" complete -c pdd -n "__fish_seen_subcommand_from checkup" -l require-all-reviewers-clean -d "Require all reviewers clean" complete -c pdd -n "__fish_seen_subcommand_from checkup" -l no-require-all-reviewers-clean -d "Do not require all reviewers clean" diff --git a/pdd/pdd_completion.zsh b/pdd/pdd_completion.zsh index 997cd4d47..64af09fb7 100644 --- a/pdd/pdd_completion.zsh +++ b/pdd/pdd_completion.zsh @@ -466,7 +466,7 @@ _pdd_checkup() { '--fixer[Fixer role]:fixer:' \ '--reviewer-fallback[Fallback reviewer when primary fails]:reviewer:' \ '--max-review-rounds[Max review rounds]:rounds:' \ - '--max-review-cost[Max review cost (USD)]:cost:' \ + '--max-review-cost[Deprecated; review cost is report-only]:cost:' \ '--max-review-minutes[Max review wall-clock minutes]:minutes:' \ '--require-all-reviewers-clean[Require all reviewers clean]' \ '--no-require-all-reviewers-clean[Do not require all reviewers clean]' \ diff --git a/pdd/prompts/agentic_checkup_python.prompt b/pdd/prompts/agentic_checkup_python.prompt index 45a430dc1..4be502205 100644 --- a/pdd/prompts/agentic_checkup_python.prompt +++ b/pdd/prompts/agentic_checkup_python.prompt @@ -9,7 +9,7 @@ Write the `pdd/agentic_checkup.py` module. Entry point for the agentic checkup workflow. Accepts a GitHub issue URL, fetches issue content and comments, loads project context (architecture.json, .pddrc), then dispatches to the multi-step orchestrator that explores the project, identifies problems, and optionally fixes them. % Requirements -1. Function: `run_agentic_checkup(issue_url: str, *, verbose: bool = False, quiet: bool = False, no_fix: bool = False, timeout_adder: float = 0.0, use_github_state: bool = True, reasoning_time: Optional[float] = None, pr_url: Optional[str] = None, review_loop: bool = False, review_only: bool = False, reviewers: str = "codex,claude", reviewer: Optional[str] = None, fixer: Optional[str] = None, reviewer_fallback: Optional[str] = None, max_review_rounds: int = 5, max_review_cost: float = 10.0, max_review_minutes: float = 90.0, require_all_reviewers_clean: bool = True, continue_on_reviewer_limit: bool = False, require_final_fresh_review: bool = True, blocking_severities: Optional[str] = None, clean_reviewer_states: Optional[str] = None, fallback_reviewer_on_failure: bool = False) -> Tuple[bool, str, float, str]` +1. Function: `run_agentic_checkup(issue_url: str, *, verbose: bool = False, quiet: bool = False, no_fix: bool = False, timeout_adder: float = 0.0, use_github_state: bool = True, reasoning_time: Optional[float] = None, pr_url: Optional[str] = None, review_loop: bool = False, review_only: bool = False, reviewers: str = "codex,claude", reviewer: Optional[str] = None, fixer: Optional[str] = None, reviewer_fallback: Optional[str] = None, max_review_rounds: int = 5, max_review_cost: float = 0.0, max_review_minutes: float = 90.0, require_all_reviewers_clean: bool = True, continue_on_reviewer_limit: bool = False, require_final_fresh_review: bool = True, blocking_severities: Optional[str] = None, clean_reviewer_states: Optional[str] = None, fallback_reviewer_on_failure: bool = False) -> Tuple[bool, str, float, str]` 2. Return 4-tuple: (success, message, total_cost, model_used) 3. Check `gh` CLI availability via `_check_gh_cli()` (reused from `agentic_change.py`) 4. Parse GitHub issue URL via `_parse_issue_url()` (reused from `agentic_change.py`) diff --git a/pdd/prompts/agentic_split_orchestrator_python.prompt b/pdd/prompts/agentic_split_orchestrator_python.prompt index 45c912729..83b4f3b19 100644 --- a/pdd/prompts/agentic_split_orchestrator_python.prompt +++ b/pdd/prompts/agentic_split_orchestrator_python.prompt @@ -36,7 +36,7 @@ Orchestrator for the agentic split workflow. Diagnoses whether a PDD dev unit ha - `delete_dead`: opt-in dead symbol deletion in step 6 (last child only) - `intent`: user hint (CLI flag); if set, step 0 is skipped - `no_phase_extraction`: skip step 6a (phase decomposition) - - `max_cost` (Issue #1384): when not None, the orchestrator checks `state.total_cost >= max_cost` BOTH at the top of every iteration of the step loop AND immediately after every `total_cost += ...` accumulation that precedes a possible early return (diagnose_only, propose_only, LEAVE_ALONE, hard-stop) AND after each per-child fan-out cost in steps 6a/6/8 (post per-child state persistence). Without the post-accumulation check, an LLM step that adds cost and then immediately exits via early return — or a multi-child fan-out — can bypass the cap by one full step's cost (or N × per-child cost). Implement as a closure `_check_budget(at_step) -> Optional[abort_tuple]` that returns the standard 5-tuple when crossed, else `None`. On hit: sets `state["max_cost_reached"] = True`, persists state via `save_workflow_state`, restores signal handlers, calls `clear_agentic_progress()`, and returns the standard 5-tuple with `success=False` and a stop message of the form `"Aborted at step : --max-cost ${cap:.2f} reached (spent ${cost:.2f}). Re-run without --max-cost or with a higher cap to resume."`. The next invocation without `--max-cost` (or with a higher value) resumes from the same state. State persistence ordering matters: budget check fires AFTER `state["step_outputs"][step] = output` AND `state["last_completed_step"] = step` so resume picks up the next step rather than re-running the just-completed one. For `9_refine_check`, also parse the paid `RefineCheck` output and persist `_pending_refine` BEFORE that post-step budget check; otherwise a max-cost abort after step 9 can save `last_completed_step="9_refine_check"` but lose the focused-refinement request. Mirrors the `max_cost_reached` pattern in `pdd/checkup_review_loop.py:1849-1862`. + - `max_cost` (Issue #1384): when not None, the orchestrator checks `state.total_cost >= max_cost` BOTH at the top of every iteration of the step loop AND immediately after every `total_cost += ...` accumulation that precedes a possible early return (diagnose_only, propose_only, LEAVE_ALONE, hard-stop) AND after each per-child fan-out cost in steps 6a/6/8 (post per-child state persistence). Without the post-accumulation check, an LLM step that adds cost and then immediately exits via early return — or a multi-child fan-out — can bypass the cap by one full step's cost (or N × per-child cost). Implement as a closure `_check_budget(at_step) -> Optional[abort_tuple]` that returns the standard 5-tuple when crossed, else `None`. On hit: sets `state["max_cost_reached"] = True`, persists state via `save_workflow_state`, restores signal handlers, calls `clear_agentic_progress()`, and returns the standard 5-tuple with `success=False` and a stop message of the form `"Aborted at step : --max-cost ${cap:.2f} reached (spent ${cost:.2f}). Re-run without --max-cost or with a higher cap to resume."`. The next invocation without `--max-cost` (or with a higher value) resumes from the same state. State persistence ordering matters: budget check fires AFTER `state["step_outputs"][step] = output` AND `state["last_completed_step"] = step` so resume picks up the next step rather than re-running the just-completed one. For `9_refine_check`, also parse the paid `RefineCheck` output and persist `_pending_refine` BEFORE that post-step budget check; otherwise a max-cost abort after step 9 can save `last_completed_step="9_refine_check"` but lose the focused-refinement request. This split-specific cap is independent from `pdd checkup --review-loop`, where review cost is report-only telemetry. 5. **Path sanitization helpers** (Issue #1384): expose two module-level helpers that the orchestrator MUST call between the Step 4 LLM output and any filesystem operation: - `_sanitize_path_string(value)`: when `value` is a `str`, strip whitespace, remove leading markdown backticks, then strip prose tails by truncating at the first whitespace / em-dash / hash / remaining backtick character. This preserves common markdown-wrapped paths like `` `pdd/foo.py` `` as `pdd/foo.py` while still dropping prose tails. Non-string inputs pass through unchanged. Empty strings remain empty. diff --git a/pdd/prompts/checkup_review_loop_python.prompt b/pdd/prompts/checkup_review_loop_python.prompt index fa145dfc3..3fead33ba 100644 --- a/pdd/prompts/checkup_review_loop_python.prompt +++ b/pdd/prompts/checkup_review_loop_python.prompt @@ -104,12 +104,10 @@ Add a PR-mode primary-reviewer/fixer review loop for `pdd checkup`. 17b. The fixer prompt MUST ask the fixer to address every valid, in-scope finding when practical, while prioritizing the configured blocking severities because those determine whether the loop continues. It must explain that "focused fixes" means avoiding unrelated refactors and broad churn, not leaving real issues unfixed. If the fixer leaves any valid finding unfixed, it must return `partially_fixed` or `blocked` with a concrete rationale. 18. The verifier pass is also the next full review pass: it must verify prior fixes and then re-review the whole PR. If it returns actionable findings, those findings become the next round's fixer input. Do not run an extra reviewer at the start of the next round unless there is no pending verifier finding set. This models the manual Codex-review / Claude-fix / Codex-review workflow and avoids two independent reviewers arguing. 19. A reviewer that hits a provider/rate/quota/context-window limit, times out, or returns unparsable output produces `status` in `{"failed", "degraded", "missing"}` (use `_failure_status` to classify based on output keywords and `continue_on_reviewer_limit`). With the default `continue_on_reviewer_limit=False`, provider/rate/quota/context-window/timeout failures are `failed`; with the flag enabled they are `degraded`. Such statuses are NOT clean even if `clean_reviewer_states` is widened — preserve safety. Do not classify normal finding prose as degraded merely because it contains the word "context"; require context-window/length/limit wording. -20. Budget caps: +20. Loop bounds: - `max_rounds` (int, ≥ 1) - - `max_cost` (float USD, > 0) - `max_minutes` (float wall clock, > 0) - Crossing any cap stops the loop, sets the relevant `*_reached` flag on state, and renders a final report whose reviewer-status reflects the unfinished round. -20a. If a cap is crossed during a successful fixer call, the loop MUST still commit and push the completed fixer changes before stopping. Push/commit is local git/GitHub plumbing, not another LLM spend. Stop before verifier/fresh-review when the cap is reached after push, so the final report correctly marks the fix as unverified but does not strand completed changes only in the local worktree. + Cost is reported but MUST NOT stop the normal implement/review loop. The loop continues until the reviewer reports merge-ready/no actionable findings, `max_rounds` is reached, a hard reviewer/fixer/push failure occurs, or the wall-clock duration is reached. `max_cost` is a deprecated compatibility field and does not set `max_cost_reached`. % Persisted artifacts 21. Artifacts live under `/.pdd/checkup-review-loop/issue-{issue_number}-pr-{pr_number}/`. The directory is created before the first reviewer runs. diff --git a/pdd/prompts/commands/checkup_python.prompt b/pdd/prompts/commands/checkup_python.prompt index e323c05a9..865337d8e 100644 --- a/pdd/prompts/commands/checkup_python.prompt +++ b/pdd/prompts/commands/checkup_python.prompt @@ -21,7 +21,7 @@ - `--fixer` (explicit fixer role; overrides the second `--reviewers` role) - `--reviewer-fallback` (optional secondary reviewer role invoked once if the primary reviewer cannot complete; must differ from the resolved reviewer and fixer) - `--max-review-rounds` (int, default `5`) - - `--max-review-cost` (float, default `10.0`) + - `--max-review-cost` (float, default `0.0`; deprecated compatibility flag, report-only) - `--max-review-minutes` (float, default `90.0`) - `--require-all-reviewers-clean/--no-require-all-reviewers-clean` (default true) - `--continue-on-reviewer-limit` (flag; reports provider/rate/context-limit/auth/network/sandbox reviewer failures as `degraded` instead of `failed`, but never marks the active reviewer clean or continues mutation without a completed review) @@ -33,7 +33,7 @@ - Mode validation: - If `--validate-arch-includes` is set: `target`, `--pr`, `--issue` must all be `None`; dispatch to `run_validate_arch_includes_cli(...)` and return. - If `--review-only` is set without `--review-loop`: reject it. - - If `--review-loop` is set: require PR mode (`--pr` and `--issue`); reject `--no-fix` unless `--review-only` is also set; validate max-review values are positive (`--max-review-rounds >= 1`, cost/minutes > 0). + - If `--review-loop` is set: require PR mode (`--pr` and `--issue`); reject `--no-fix` unless `--review-only` is also set; validate max-review values (`--max-review-rounds >= 1`, `--max-review-cost >= 0`, minutes > 0). - If `--pr` or `--issue` is set: BOTH must be present; `target` must be `None`; validate `--pr` via `_parse_pr_url(pr_url)` (mention "GitHub pull-request URL" in error); validate `--issue` via `_is_github_issue_url(issue_url_opt)` (mention "GitHub issue URL" in error); use `issue_url_opt` as the effective issue URL. - Otherwise (default issue mode): require `target`; validate via `_is_github_issue_url(target)` (BadParameter with `param_hint="'TARGET'"` and message must mention "GitHub issue URL"). - Force `--no-fix` when `--pr` is set and `--review-loop` is NOT set (legacy PR verification still cannot push fixes). Print a warning to stderr explaining why and how to re-invoke if fixes were intended. diff --git a/pdd/prompts/pdd_completion_fish.prompt b/pdd/prompts/pdd_completion_fish.prompt index 4deaf61ca..4f0552962 100644 --- a/pdd/prompts/pdd_completion_fish.prompt +++ b/pdd/prompts/pdd_completion_fish.prompt @@ -17,6 +17,6 @@ % The generated pdd_completion.fish script must provide comprehensive auto-completion support for: % - All commands of the PDD CLI (as per the README). % - All command-specific options (as per the README). -% - The `checkup` command and its PR/review-loop options: `--pr`, `--issue`, `--review-loop`, `--review-only`, `--reviewers`, `--reviewer`, `--fixer`, `--reviewer-fallback`, review round/cost/minute limits, reviewer-status compatibility flags (`--continue-on-reviewer-limit`, `--fallback-reviewer-on-failure`), `--blocking-severities`, and `--clean-reviewer-states`. Emit only one `# checkup command` completion block — do not append a second checkup section for additional options, fish does not merge them and duplicates produce stale suggestions. +% - The `checkup` command and its PR/review-loop options: `--pr`, `--issue`, `--review-loop`, `--review-only`, `--reviewers`, `--reviewer`, `--fixer`, `--reviewer-fallback`, review round/minute limits, deprecated report-only cost telemetry (`--max-review-cost`), reviewer-status compatibility flags (`--continue-on-reviewer-limit`, `--fallback-reviewer-on-failure`), `--blocking-severities`, and `--clean-reviewer-states`. Emit only one `# checkup command` completion block — do not append a second checkup section for additional options, fish does not merge them and duplicates produce stale suggestions. % - All global options, including those found in the README and the newly specified `--time` option. % - Filenames, where appropriate for command arguments. diff --git a/pdd/prompts/pdd_completion_zsh.prompt b/pdd/prompts/pdd_completion_zsh.prompt index 979f0387b..92e2041f2 100644 --- a/pdd/prompts/pdd_completion_zsh.prompt +++ b/pdd/prompts/pdd_completion_zsh.prompt @@ -23,6 +23,6 @@ % - The entry should follow the standard Zsh completion format, incorporating the description, for example: `'--time[Controls LLM reasoning effort (0.0-1.0)]'`. % - This `--time` option should be available for completion globally, meaning it can be used with the main `pdd` command and alongside any of its subcommands. % -% The generated script must also include the `checkup` subcommand and its README-documented options, including PR mode (`--pr`, `--issue`), review-loop mode (`--review-loop`, `--review-only`, `--reviewers`, `--reviewer`, `--fixer`, `--reviewer-fallback`), review budget/limit flags, reviewer-status compatibility flags (`--continue-on-reviewer-limit`, `--fallback-reviewer-on-failure`), `--blocking-severities`, and `--clean-reviewer-states`. +% The generated script must also include the `checkup` subcommand and its README-documented options, including PR mode (`--pr`, `--issue`), review-loop mode (`--review-loop`, `--review-only`, `--reviewers`, `--reviewer`, `--fixer`, `--reviewer-fallback`), review round/minute limit flags, deprecated report-only cost telemetry (`--max-review-cost`), reviewer-status compatibility flags (`--continue-on-reviewer-limit`, `--fallback-reviewer-on-failure`), `--blocking-severities`, and `--clean-reviewer-states`. % % Define `_pdd_checkup` exactly once. zsh autoload resolves duplicate function definitions by overwriting earlier ones, so do not emit a second `_pdd_checkup()` block alongside or below the canonical definition — every flag must live inside a single function. diff --git a/tests/test_checkup_review_loop.py b/tests/test_checkup_review_loop.py index 642d463c5..0597a7bc8 100644 --- a/tests/test_checkup_review_loop.py +++ b/tests/test_checkup_review_loop.py @@ -217,7 +217,7 @@ def fake_task(role: str, instruction: str, cwd: Path, **kwargs: Any): assert not any("review-claude" in label for _, label in calls) assert not any("fresh-final" in label for _, label in calls) - def test_cost_cap_after_review_stops_before_fixer_or_push( + def test_cost_does_not_stop_before_fixer_or_push( self, monkeypatch: Any, tmp_path: Path ) -> None: from pdd.checkup_review_loop import run_checkup_review_loop @@ -235,15 +235,15 @@ def test_cost_cap_after_review_stops_before_fixer_or_push( } def fake_task(role: str, instruction: str, cwd: Path, **kwargs: Any): - calls.append(kwargs["label"]) + label = kwargs["label"] + calls.append(label) + if "fix-" in label: + return True, '{"summary":"fixed","changed_files":["tests/test_flow.py"]}', 0.2, role + if "verify-" in label: + return True, _json("clean"), 0.1, role return True, _json("findings", [finding]), 1.0, role monkeypatch.setattr(mod, "_run_role_task", fake_task) - monkeypatch.setattr( - mod, - "_commit_and_push_if_changed", - lambda *a, **k: pytest.fail("must not push after review cost cap"), - ) success, report, cost, _model = run_checkup_review_loop( context=_ctx(tmp_path), @@ -254,14 +254,17 @@ def fake_task(role: str, instruction: str, cwd: Path, **kwargs: Any): ) assert success is True - assert cost == 1.0 - assert calls == ["checkup-review-loop-review-codex-round1"] - assert "max-cost-reached: true" in report - assert "issue_aligned: unknown" in report - assert "Review loop stopped on a configured safety limit" not in report - assert "The PR does not test the new workflow." in report + assert round(cost, 2) == 1.3 + assert calls == [ + "checkup-review-loop-review-codex-round1", + "checkup-review-loop-fix-claude-for-codex-round1", + "checkup-review-loop-verify-codex-round1", + ] + assert "max-cost-reached: false" in report + assert "issue_aligned: true" in report + assert "The PR does not test the new workflow." not in report - def test_cost_cap_after_fixer_pushes_then_stops_before_verifier( + def test_cost_does_not_stop_after_fixer_push_before_verifier( self, monkeypatch: Any, tmp_path: Path ) -> None: from pdd.checkup_review_loop import run_checkup_review_loop @@ -283,6 +286,8 @@ def fake_task(role: str, instruction: str, cwd: Path, **kwargs: Any): calls.append(label) if "fix-" in label: return True, '{"summary":"fixed","changed_files":["pdd/api.py"]}', 1.0, role + if "verify-" in label: + return True, _json("clean"), 0.1, role return True, _json("findings", [finding]), 0.1, role pushes: List[str] = [] @@ -303,16 +308,17 @@ def fake_task(role: str, instruction: str, cwd: Path, **kwargs: Any): ) assert success is True - assert round(cost, 2) == 1.1 + assert round(cost, 2) == 1.2 assert calls == [ "checkup-review-loop-review-codex-round1", "checkup-review-loop-fix-claude-for-codex-round1", + "checkup-review-loop-verify-codex-round1", ] assert pushes == ["pushed"] - assert "max-cost-reached: true" in report - assert "issue_aligned: unknown" in report - assert "The API accepts invalid input." in report - assert "verification=unverified" in report + assert "max-cost-reached: false" in report + assert "issue_aligned: true" in report + assert "The API accepts invalid input." not in report + assert "verification=verified" in report def test_codex_findings_are_given_to_claude_then_verified_by_codex( self, monkeypatch: Any, tmp_path: Path