Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2746,7 +2746,7 @@ Options:
- `--reviewers ROLES`: Legacy comma-separated review-loop role order, interpreted as `reviewer,fixer` (default: `codex,claude`).
- `--reviewer-fallback ROLE`: Optional secondary reviewer role to invoke once if the primary reviewer cannot complete (for example, because of auth, network, sandbox, or CLI failures). The fallback must resolve to a role different from the reviewer and fixer; if it succeeds, it becomes the active reviewer for the remaining loop and the superseded primary's row in the final report is annotated `(optional, superseded by <fallback>)` so downstream verdict adapters drop the failed primary from the required-reviewer set and resolve to `ship_degraded` instead of `unknown`.
- `--max-review-rounds INT`: Maximum primary-reviewer/fixer rounds (default: 5).
- `--max-review-cost FLOAT`: Maximum review-loop LLM cost in USD (default: 10.0).
- `--max-review-cost FLOAT`: Deprecated compatibility flag. Review-loop cost is reported as telemetry and does not stop the codex-review / claude-fix / codex-verify loop (default: 0.0).
- `--max-review-minutes FLOAT`: Maximum review-loop wall-clock minutes (default: 90.0).
- `--blocking-severities LIST`: Comma-separated severity names used for review-loop reporting and prompt guidance (default: `blocker,critical,medium`). The fixer still receives every valid reviewer finding.
- `--continue-on-reviewer-limit`: Report provider, rate, context-window, timeout, auth, network, sandbox, permission, and non-zero-exit reviewer failures as `degraded` instead of `failed`. Degraded reviewers are still not clean unless a configured fallback reviewer completes successfully and takes over as the active reviewer.
Expand Down
3 changes: 1 addition & 2 deletions examples/checkup_review_loop_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ def main() -> None:
config = ReviewLoopConfig(
reviewers=parse_reviewers("codex,claude"),
max_rounds=1,
max_cost=1.0,
max_minutes=5.0,
)

Expand Down Expand Up @@ -94,4 +93,4 @@ def main() -> None:
console.print(f"Model: {model}")

if __name__ == "__main__":
main()
main()
2 changes: 1 addition & 1 deletion pdd/agentic_checkup.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def run_agentic_checkup(
fixer: Optional[str] = None,
reviewer_fallback: Optional[str] = None,
max_review_rounds: int = 5,
max_review_cost: float = 10.0,
max_review_cost: float = 0.0,
max_review_minutes: float = 90.0,
require_all_reviewers_clean: bool = True,
continue_on_reviewer_limit: bool = False,
Expand Down
5 changes: 2 additions & 3 deletions pdd/agentic_split_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1404,9 +1404,8 @@ def _check_budget(at_step: str) -> Optional[Tuple[bool, str, float, str, List[st
"""Budget guard — return abort tuple if --max-cost crossed, else None.

Persists state so the next run without --max-cost (or with a higher
one) resumes from ``at_step``. Mirrors the pattern in
``pdd/checkup_review_loop.py:1849-1862`` (max_cost_reached flag +
stop_reason).
one) resumes from ``at_step`` with the max_cost_reached flag and
stop_reason recorded.

MUST be called both at the top of each step iteration AND after every
``total_cost += ...`` accumulation that precedes a possible early
Expand Down
13 changes: 6 additions & 7 deletions pdd/checkup_review_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,9 @@ class ReviewLoopConfig:
reviewer_fallback: Optional[str] = None
review_only: bool = False
max_rounds: int = 5
max_cost: float = 10.0
# Deprecated compatibility field. Cost is reported but does not stop the
# implement/review loop; max_rounds and max_minutes bound normal execution.
max_cost: float = 0.0
max_minutes: float = 90.0
# Kept for CLI/API compatibility. The loop has one authoritative reviewer,
# so this is no longer used as a ship gate.
Expand Down Expand Up @@ -2998,21 +3000,18 @@ def _fix_dispute_note(fix: FixResult, finding: ReviewFinding) -> str:


def _budget_exhausted(
config: ReviewLoopConfig,
state: ReviewLoopState,
_config: ReviewLoopConfig,
_state: ReviewLoopState,
deadline: float,
) -> bool:
return state.total_cost >= config.max_cost or time.monotonic() >= deadline
return time.monotonic() >= deadline


def _mark_budget_exhausted(
config: ReviewLoopConfig,
state: ReviewLoopState,
deadline: float,
) -> None:
if state.total_cost >= config.max_cost:
state.max_cost_reached = True
state.stop_reason = f"Max review cost reached: ${config.max_cost:.2f}."
if time.monotonic() >= deadline:
state.max_duration_reached = True
state.stop_reason = f"Max review duration reached: {config.max_minutes:g} minutes."
Expand Down
11 changes: 7 additions & 4 deletions pdd/commands/checkup.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,12 @@
@click.option(
"--max-review-cost",
type=float,
default=10.0,
default=0.0,
show_default=True,
help="Maximum review-loop LLM cost in USD.",
help=(
"Deprecated compatibility flag. Review-loop cost is reported but does "
"not stop the implement/review loop."
),
)
@click.option(
"--max-review-minutes",
Expand Down Expand Up @@ -279,9 +282,9 @@ def checkup(
"--max-review-rounds must be >= 1.",
param_hint="'--max-review-rounds'",
)
if review_loop and max_review_cost <= 0:
if review_loop and max_review_cost < 0:
raise click.BadParameter(
"--max-review-cost must be > 0.",
"--max-review-cost must be >= 0.",
param_hint="'--max-review-cost'",
)
if review_loop and max_review_minutes <= 0:
Expand Down
2 changes: 1 addition & 1 deletion pdd/pdd_completion.fish
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ complete -c pdd -n "__fish_seen_subcommand_from checkup" -l reviewer -x -d "Prim
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l fixer -x -d "Fixer role"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l reviewer-fallback -x -d "Fallback reviewer when primary fails"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-rounds -x -d "Max review rounds"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-cost -x -d "Max review cost (USD)"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-cost -x -d "Deprecated; review cost is report-only"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l max-review-minutes -x -d "Max review wall-clock minutes"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l require-all-reviewers-clean -d "Require all reviewers clean"
complete -c pdd -n "__fish_seen_subcommand_from checkup" -l no-require-all-reviewers-clean -d "Do not require all reviewers clean"
Expand Down
2 changes: 1 addition & 1 deletion pdd/pdd_completion.zsh
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ _pdd_checkup() {
'--fixer[Fixer role]:fixer:' \
'--reviewer-fallback[Fallback reviewer when primary fails]:reviewer:' \
'--max-review-rounds[Max review rounds]:rounds:' \
'--max-review-cost[Max review cost (USD)]:cost:' \
'--max-review-cost[Deprecated; review cost is report-only]:cost:' \
'--max-review-minutes[Max review wall-clock minutes]:minutes:' \
'--require-all-reviewers-clean[Require all reviewers clean]' \
'--no-require-all-reviewers-clean[Do not require all reviewers clean]' \
Expand Down
2 changes: 1 addition & 1 deletion pdd/prompts/agentic_checkup_python.prompt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Write the `pdd/agentic_checkup.py` module.
Entry point for the agentic checkup workflow. Accepts a GitHub issue URL, fetches issue content and comments, loads project context (architecture.json, .pddrc), then dispatches to the multi-step orchestrator that explores the project, identifies problems, and optionally fixes them.

% Requirements
1. Function: `run_agentic_checkup(issue_url: str, *, verbose: bool = False, quiet: bool = False, no_fix: bool = False, timeout_adder: float = 0.0, use_github_state: bool = True, reasoning_time: Optional[float] = None, pr_url: Optional[str] = None, review_loop: bool = False, review_only: bool = False, reviewers: str = "codex,claude", reviewer: Optional[str] = None, fixer: Optional[str] = None, reviewer_fallback: Optional[str] = None, max_review_rounds: int = 5, max_review_cost: float = 10.0, max_review_minutes: float = 90.0, require_all_reviewers_clean: bool = True, continue_on_reviewer_limit: bool = False, require_final_fresh_review: bool = True, blocking_severities: Optional[str] = None, clean_reviewer_states: Optional[str] = None, fallback_reviewer_on_failure: bool = False) -> Tuple[bool, str, float, str]`
1. Function: `run_agentic_checkup(issue_url: str, *, verbose: bool = False, quiet: bool = False, no_fix: bool = False, timeout_adder: float = 0.0, use_github_state: bool = True, reasoning_time: Optional[float] = None, pr_url: Optional[str] = None, review_loop: bool = False, review_only: bool = False, reviewers: str = "codex,claude", reviewer: Optional[str] = None, fixer: Optional[str] = None, reviewer_fallback: Optional[str] = None, max_review_rounds: int = 5, max_review_cost: float = 0.0, max_review_minutes: float = 90.0, require_all_reviewers_clean: bool = True, continue_on_reviewer_limit: bool = False, require_final_fresh_review: bool = True, blocking_severities: Optional[str] = None, clean_reviewer_states: Optional[str] = None, fallback_reviewer_on_failure: bool = False) -> Tuple[bool, str, float, str]`
2. Return 4-tuple: (success, message, total_cost, model_used)
3. Check `gh` CLI availability via `_check_gh_cli()` (reused from `agentic_change.py`)
4. Parse GitHub issue URL via `_parse_issue_url()` (reused from `agentic_change.py`)
Expand Down
2 changes: 1 addition & 1 deletion pdd/prompts/agentic_split_orchestrator_python.prompt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Orchestrator for the agentic split workflow. Diagnoses whether a PDD dev unit ha
- `delete_dead`: opt-in dead symbol deletion in step 6 (last child only)
- `intent`: user hint (CLI flag); if set, step 0 is skipped
- `no_phase_extraction`: skip step 6a (phase decomposition)
- `max_cost` (Issue #1384): when not None, the orchestrator checks `state.total_cost >= max_cost` BOTH at the top of every iteration of the step loop AND immediately after every `total_cost += ...` accumulation that precedes a possible early return (diagnose_only, propose_only, LEAVE_ALONE, hard-stop) AND after each per-child fan-out cost in steps 6a/6/8 (post per-child state persistence). Without the post-accumulation check, an LLM step that adds cost and then immediately exits via early return — or a multi-child fan-out — can bypass the cap by one full step's cost (or N × per-child cost). Implement as a closure `_check_budget(at_step) -> Optional[abort_tuple]` that returns the standard 5-tuple when crossed, else `None`. On hit: sets `state["max_cost_reached"] = True`, persists state via `save_workflow_state`, restores signal handlers, calls `clear_agentic_progress()`, and returns the standard 5-tuple with `success=False` and a stop message of the form `"Aborted at step <step>: --max-cost ${cap:.2f} reached (spent ${cost:.2f}). Re-run without --max-cost or with a higher cap to resume."`. The next invocation without `--max-cost` (or with a higher value) resumes from the same state. State persistence ordering matters: budget check fires AFTER `state["step_outputs"][step] = output` AND `state["last_completed_step"] = step` so resume picks up the next step rather than re-running the just-completed one. For `9_refine_check`, also parse the paid `RefineCheck` output and persist `_pending_refine` BEFORE that post-step budget check; otherwise a max-cost abort after step 9 can save `last_completed_step="9_refine_check"` but lose the focused-refinement request. Mirrors the `max_cost_reached` pattern in `pdd/checkup_review_loop.py:1849-1862`.
- `max_cost` (Issue #1384): when not None, the orchestrator checks `state.total_cost >= max_cost` BOTH at the top of every iteration of the step loop AND immediately after every `total_cost += ...` accumulation that precedes a possible early return (diagnose_only, propose_only, LEAVE_ALONE, hard-stop) AND after each per-child fan-out cost in steps 6a/6/8 (post per-child state persistence). Without the post-accumulation check, an LLM step that adds cost and then immediately exits via early return — or a multi-child fan-out — can bypass the cap by one full step's cost (or N × per-child cost). Implement as a closure `_check_budget(at_step) -> Optional[abort_tuple]` that returns the standard 5-tuple when crossed, else `None`. On hit: sets `state["max_cost_reached"] = True`, persists state via `save_workflow_state`, restores signal handlers, calls `clear_agentic_progress()`, and returns the standard 5-tuple with `success=False` and a stop message of the form `"Aborted at step <step>: --max-cost ${cap:.2f} reached (spent ${cost:.2f}). Re-run without --max-cost or with a higher cap to resume."`. The next invocation without `--max-cost` (or with a higher value) resumes from the same state. State persistence ordering matters: budget check fires AFTER `state["step_outputs"][step] = output` AND `state["last_completed_step"] = step` so resume picks up the next step rather than re-running the just-completed one. For `9_refine_check`, also parse the paid `RefineCheck` output and persist `_pending_refine` BEFORE that post-step budget check; otherwise a max-cost abort after step 9 can save `last_completed_step="9_refine_check"` but lose the focused-refinement request. This split-specific cap is independent from `pdd checkup --review-loop`, where review cost is report-only telemetry.

5. **Path sanitization helpers** (Issue #1384): expose two module-level helpers that the orchestrator MUST call between the Step 4 LLM output and any filesystem operation:
- `_sanitize_path_string(value)`: when `value` is a `str`, strip whitespace, remove leading markdown backticks, then strip prose tails by truncating at the first whitespace / em-dash / hash / remaining backtick character. This preserves common markdown-wrapped paths like `` `pdd/foo.py` `` as `pdd/foo.py` while still dropping prose tails. Non-string inputs pass through unchanged. Empty strings remain empty.
Expand Down
6 changes: 2 additions & 4 deletions pdd/prompts/checkup_review_loop_python.prompt
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,10 @@ Add a PR-mode primary-reviewer/fixer review loop for `pdd checkup`.
17b. The fixer prompt MUST ask the fixer to address every valid, in-scope finding when practical, while prioritizing the configured blocking severities because those determine whether the loop continues. It must explain that "focused fixes" means avoiding unrelated refactors and broad churn, not leaving real issues unfixed. If the fixer leaves any valid finding unfixed, it must return `partially_fixed` or `blocked` with a concrete rationale.
18. The verifier pass is also the next full review pass: it must verify prior fixes and then re-review the whole PR. If it returns actionable findings, those findings become the next round's fixer input. Do not run an extra reviewer at the start of the next round unless there is no pending verifier finding set. This models the manual Codex-review / Claude-fix / Codex-review workflow and avoids two independent reviewers arguing.
19. A reviewer that hits a provider/rate/quota/context-window limit, times out, or returns unparsable output produces `status` in `{"failed", "degraded", "missing"}` (use `_failure_status` to classify based on output keywords and `continue_on_reviewer_limit`). With the default `continue_on_reviewer_limit=False`, provider/rate/quota/context-window/timeout failures are `failed`; with the flag enabled they are `degraded`. Such statuses are NOT clean even if `clean_reviewer_states` is widened — preserve safety. Do not classify normal finding prose as degraded merely because it contains the word "context"; require context-window/length/limit wording.
20. Budget caps:
20. Loop bounds:
- `max_rounds` (int, ≥ 1)
- `max_cost` (float USD, > 0)
- `max_minutes` (float wall clock, > 0)
Crossing any cap stops the loop, sets the relevant `*_reached` flag on state, and renders a final report whose reviewer-status reflects the unfinished round.
20a. If a cap is crossed during a successful fixer call, the loop MUST still commit and push the completed fixer changes before stopping. Push/commit is local git/GitHub plumbing, not another LLM spend. Stop before verifier/fresh-review when the cap is reached after push, so the final report correctly marks the fix as unverified but does not strand completed changes only in the local worktree.
Cost is reported but MUST NOT stop the normal implement/review loop. The loop continues until the reviewer reports merge-ready/no actionable findings, `max_rounds` is reached, a hard reviewer/fixer/push failure occurs, or the wall-clock duration is reached. `max_cost` is a deprecated compatibility field and does not set `max_cost_reached`.

% Persisted artifacts
21. Artifacts live under `<git_root>/.pdd/checkup-review-loop/issue-{issue_number}-pr-{pr_number}/`. The directory is created before the first reviewer runs.
Expand Down
4 changes: 2 additions & 2 deletions pdd/prompts/commands/checkup_python.prompt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
- `--fixer` (explicit fixer role; overrides the second `--reviewers` role)
- `--reviewer-fallback` (optional secondary reviewer role invoked once if the primary reviewer cannot complete; must differ from the resolved reviewer and fixer)
- `--max-review-rounds` (int, default `5`)
- `--max-review-cost` (float, default `10.0`)
- `--max-review-cost` (float, default `0.0`; deprecated compatibility flag, report-only)
- `--max-review-minutes` (float, default `90.0`)
- `--require-all-reviewers-clean/--no-require-all-reviewers-clean` (default true)
- `--continue-on-reviewer-limit` (flag; reports provider/rate/context-limit/auth/network/sandbox reviewer failures as `degraded` instead of `failed`, but never marks the active reviewer clean or continues mutation without a completed review)
Expand All @@ -33,7 +33,7 @@
- Mode validation:
- If `--validate-arch-includes` is set: `target`, `--pr`, `--issue` must all be `None`; dispatch to `run_validate_arch_includes_cli(...)` and return.
- If `--review-only` is set without `--review-loop`: reject it.
- If `--review-loop` is set: require PR mode (`--pr` and `--issue`); reject `--no-fix` unless `--review-only` is also set; validate max-review values are positive (`--max-review-rounds >= 1`, cost/minutes > 0).
- If `--review-loop` is set: require PR mode (`--pr` and `--issue`); reject `--no-fix` unless `--review-only` is also set; validate max-review values (`--max-review-rounds >= 1`, `--max-review-cost >= 0`, minutes > 0).
- If `--pr` or `--issue` is set: BOTH must be present; `target` must be `None`; validate `--pr` via `_parse_pr_url(pr_url)` (mention "GitHub pull-request URL" in error); validate `--issue` via `_is_github_issue_url(issue_url_opt)` (mention "GitHub issue URL" in error); use `issue_url_opt` as the effective issue URL.
- Otherwise (default issue mode): require `target`; validate via `_is_github_issue_url(target)` (BadParameter with `param_hint="'TARGET'"` and message must mention "GitHub issue URL").
- Force `--no-fix` when `--pr` is set and `--review-loop` is NOT set (legacy PR verification still cannot push fixes). Print a warning to stderr explaining why and how to re-invoke if fixes were intended.
Expand Down
2 changes: 1 addition & 1 deletion pdd/prompts/pdd_completion_fish.prompt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@
% The generated pdd_completion.fish script must provide comprehensive auto-completion support for:
% - All commands of the PDD CLI (as per the README).
% - All command-specific options (as per the README).
% - The `checkup` command and its PR/review-loop options: `--pr`, `--issue`, `--review-loop`, `--review-only`, `--reviewers`, `--reviewer`, `--fixer`, `--reviewer-fallback`, review round/cost/minute limits, reviewer-status compatibility flags (`--continue-on-reviewer-limit`, `--fallback-reviewer-on-failure`), `--blocking-severities`, and `--clean-reviewer-states`. Emit only one `# checkup command` completion block — do not append a second checkup section for additional options, fish does not merge them and duplicates produce stale suggestions.
% - The `checkup` command and its PR/review-loop options: `--pr`, `--issue`, `--review-loop`, `--review-only`, `--reviewers`, `--reviewer`, `--fixer`, `--reviewer-fallback`, review round/minute limits, deprecated report-only cost telemetry (`--max-review-cost`), reviewer-status compatibility flags (`--continue-on-reviewer-limit`, `--fallback-reviewer-on-failure`), `--blocking-severities`, and `--clean-reviewer-states`. Emit only one `# checkup command` completion block — do not append a second checkup section for additional options, fish does not merge them and duplicates produce stale suggestions.
% - All global options, including those found in the README and the newly specified `--time` option.
% - Filenames, where appropriate for command arguments.
Loading
Loading