diff --git a/.pdd/meta/llm_invoke_python.json b/.pdd/meta/llm_invoke_python.json
index 21c916074..f47b772f0 100644
--- a/.pdd/meta/llm_invoke_python.json
+++ b/.pdd/meta/llm_invoke_python.json
@@ -1,13 +1,13 @@
{
- "pdd_version": "0.0.241",
- "timestamp": "2026-05-17T18:33:34.290918+00:00",
- "command": "example",
- "prompt_hash": "08f20f2e33886688d14103186bc55760a745b23e9bbe2626a0a7a6c4e1c1528c",
- "code_hash": "42370b9d59b0caca264991021fb612f02bdae906c8043451288a64697af894ae",
+ "pdd_version": "0.0.244.dev7",
+ "timestamp": "2026-05-19T22:00:00.000000+00:00",
+ "command": "fix",
+ "prompt_hash": "8769b1adcb99a783edbfc75a0db123357c6a9959af78bba15ae2c80de00d802b",
+ "code_hash": "4c853854bca72a48109d0e6c6ac71c13869e37ff1bf2e5e17ad8b776500690ef",
"example_hash": "48c5aece0ddd153f95ec8a53802d2173d4da12a920b398c5feebb307b9958417",
- "test_hash": "630ae15410752b1bda2608d72ab104add66b1f9cbbd134c003183339b46353f9",
+ "test_hash": "b612ca94d95d2902db9219088726a76eee2bf636ead5a1495b2087f4c6cddf6b",
"test_files": {
- "test_llm_invoke.py": "630ae15410752b1bda2608d72ab104add66b1f9cbbd134c003183339b46353f9",
+ "test_llm_invoke.py": "b612ca94d95d2902db9219088726a76eee2bf636ead5a1495b2087f4c6cddf6b",
"test_llm_invoke_csv_model_registration.py": "1583b5e076fe5227a11f3d1079034ab4a21bc6131a3433f37bd68e35a99ba49e",
"test_llm_invoke_integration.py": "2eb4bd2565761a4148762c6fb73c887cb6e12ff962742e05f5c2d4d62b47aaf9",
"test_llm_invoke_nested_schema.py": "c983a19874abacc3e0ea9d6ca2ec87495960d2dd96d4e93be4039ed1bc995b9b",
@@ -21,4 +21,4 @@
"pdd/core/cloud.py": "0487c0b989996af144df3af1c818a6eeafe52fd209710e5a54eb43990c89ae97",
"pdd/server/token_counter.py": "3391a7c708713e3d370bf9e981a837f1f7ade08af75d69f402301dbf65a79c9a"
}
-}
\ No newline at end of file
+}
diff --git a/.pdd/meta/summarize_directory_python.json b/.pdd/meta/summarize_directory_python.json
index 489dbec66..ad272f549 100644
--- a/.pdd/meta/summarize_directory_python.json
+++ b/.pdd/meta/summarize_directory_python.json
@@ -1,17 +1,17 @@
{
- "pdd_version": "0.0.228",
- "timestamp": "2026-05-06T03:28:21.684198+00:00",
- "command": "regenerate-public",
- "prompt_hash": "ea40b7699d4cf1ab98a7eed076aea7117a1c175ae16956afcfc7f06856837b17",
- "code_hash": "a96e3b5ddb13717395fbc046cf0279486e1225d63deb51657f1ddb4eaf80c0af",
+ "pdd_version": "0.0.244.dev7",
+ "timestamp": "2026-05-19T22:00:00.000000+00:00",
+ "command": "fix",
+ "prompt_hash": "8aaf8f0055d1d10efb75b4219753f7a03c9a0094d730939b85e1952f7b8e004f",
+ "code_hash": "89663dc352f5a79a0bac618840d05bafbb900d47425d31e12b6229aac9797fa7",
"example_hash": null,
- "test_hash": "6198ba0d33bcb2d06b6a90106dd2ae7adfc112a1ca8de62c58b23b8f9c2502a9",
+ "test_hash": "16f0ccf2eaf2c2a1498528da7913ac22aca8cffcc26941bb34a961c1ced032fb",
"test_files": {
- "test_summarize_directory.py": "6198ba0d33bcb2d06b6a90106dd2ae7adfc112a1ca8de62c58b23b8f9c2502a9"
+ "test_summarize_directory.py": "16f0ccf2eaf2c2a1498528da7913ac22aca8cffcc26941bb34a961c1ced032fb"
},
"include_deps": {
- "context/llm_invoke_example.py": "d749aa00a35c88c254c4aaa9e1280d85342d71214710e2f3f7a2bd9388a92168",
+ "context/llm_invoke_example.py": "48c5aece0ddd153f95ec8a53802d2173d4da12a920b398c5feebb307b9958417",
"context/load_prompt_template_example.py": "a1cd6619182c6c951f5856dda4070e202875a5884bbfab9cc191d24de2f4951f",
- "context/python_preamble.prompt": "57a3e51f529024ec0cb9658cd6ac61a7c8051ba0c8e887b31cf00b2e78a07d83"
+ "context/python_preamble.prompt": "0388ed131bf986f8752e1bc4c81e4da0460cfe2908ec8c60b1314edbab768254"
}
-}
\ No newline at end of file
+}
diff --git a/README.md b/README.md
index 7fc7fc684..41e03cbdd 100644
--- a/README.md
+++ b/README.md
@@ -804,8 +804,9 @@ The generated CSV file includes the following columns:
- cost: The estimated cost of the operation in USD (e.g., 0.05 for 5 cents). This will be zero for local models or operations that do not use a LLM.
- input_files: A list of input files involved in the operation
- output_files: A list of output files generated or modified by the operation
+- attempted_models: A semicolon-delimited, chronological list of every model PDD attempted for this command, including the final successful model as well as any earlier models that failed and were abandoned (e.g., `vertex_ai/gemini-2.5-pro;deepseek/deepseek-chat`). For a normal single-model success this contains a single entry; empty only when no LLM attempt was recorded for the command.
-This comprehensive output allows for detailed tracking of not only the cost and type of operations but also the specific files involved in each PDD command execution.
+This comprehensive output allows for detailed tracking of not only the cost and type of operations but also the specific files involved in each PDD command execution, plus the full fallback chain when PDD switched models mid-run.
### Environment Variable
diff --git a/architecture.json b/architecture.json
index 6e950719e..86c002c0f 100644
--- a/architecture.json
+++ b/architecture.json
@@ -1138,8 +1138,8 @@
}
},
{
- "reason": "Tracks LLM usage costs across operations.",
- "description": "Records token usage and calculates costs. Aggregates costs across workflow steps.",
+ "reason": "Tracks LLM usage costs across operations and records the attempted-model fallback chain.",
+ "description": "Click decorator that records timestamp, model, command, cost, input/output files, and the full attempted-model chain (semicolon-delimited) to the cost CSV. Accepts both legacy tuple and enriched-dict result shapes from decorated commands.",
"dependencies": [],
"priority": 33,
"filename": "track_cost_python.prompt",
@@ -1155,15 +1155,15 @@
{
"name": "track_cost",
"signature": "(func)",
- "returns": "None",
+ "returns": "Callable",
"sideEffects": [
"None"
]
},
{
"name": "extract_cost_and_model",
- "signature": "(result: Any) -> Tuple[Any, str]",
- "returns": "Tuple[Any, str]",
+ "signature": "(result: Any) -> Tuple[Any, str, List[str]]",
+ "returns": "Tuple[Any, str, List[str]]",
"sideEffects": [
"None"
]
@@ -2416,8 +2416,8 @@
},
{
"name": "cloud_fix_errors",
- "signature": "(unit_test: str, code: str, prompt: str, error: str, error_file: str, strength: float, temperature: float, verbose: bool = False, time: float = DEFAULT_TIME, code_file_ext: str = '.py') -> Tuple[bool, bool, str, str, str, float, str]",
- "returns": "Tuple[bool, bool, str, str, str, float, str]",
+ "signature": "(unit_test: str, code: str, prompt: str, error: str, error_file: str, strength: float, temperature: float, verbose: bool = False, time: float = DEFAULT_TIME, code_file_ext: str = '.py', protect_tests: bool = False, failure_classification: str | None = None) -> Tuple[bool, bool, str, str, str, float, str, List[str]]",
+ "returns": "Tuple[bool, bool, str, str, str, float, str, List[str]]",
"sideEffects": [
"None"
]
@@ -2440,8 +2440,8 @@
},
{
"name": "fix_error_loop",
- "signature": "(unit_test_file: str, code_file: str, prompt_file: str, prompt: str, verification_program: str, strength: float, temperature: float, max_attempts: int, budget: float, error_log_file: str = 'error_log.txt', verbose: bool = False, time: float = DEFAULT_TIME, agentic_fallback: bool = True, use_cloud: bool = False)",
- "returns": "None",
+ "signature": "(unit_test_file: str, code_file: str, prompt_file: str, prompt: str, verification_program: str, strength: float, temperature: float, max_attempts: int, budget: float, error_log_file: str = 'error_log.txt', verbose: bool = False, time: float = DEFAULT_TIME, agentic_fallback: bool = True, protect_tests: bool = False, use_cloud: bool = False, test_files: list[str] | None = None, failure_aware_retries: bool = True) -> Tuple[bool, str, str, int, float, str]",
+ "returns": "Tuple[bool, str, str, int, float, str]",
"sideEffects": [
"None"
]
@@ -8808,5 +8808,47 @@
"x": 24600,
"y": 800
}
+ },
+ {
+ "reason": "Provides unified LLM invocation across all PDD operations with provider abstraction, retries, and context window validation.",
+ "description": "Implements the llm_invoke entry point with model selection, structured output handling, cloud/local routing, attempted-model tracking, batch mode, cost accumulation, and Pydantic schema validation.",
+ "dependencies": [
+ "path_resolution_python.prompt",
+ "server/token_counter_python.prompt"
+ ],
+ "priority": 236,
+ "filename": "llm_invoke_python.prompt",
+ "filepath": "pdd/llm_invoke.py",
+ "tags": [
+ "module",
+ "python"
+ ],
+ "interface": {
+ "type": "module",
+ "module": {
+ "functions": [
+ {
+ "name": "llm_invoke",
+ "signature": "(prompt, input_json, strength, temperature, verbose, output_pydantic, output_schema, time, use_batch_mode, messages, language, use_cloud)",
+ "returns": "Dict[str, Any]"
+ },
+ {
+ "name": "setup_file_logging",
+ "signature": "(log_file_path=None)",
+ "returns": "None"
+ },
+ {
+ "name": "set_verbose_logging",
+ "signature": "(verbose=False)",
+ "returns": "None"
+ },
+ {
+ "name": "set_quiet_logging",
+ "signature": "()",
+ "returns": "None"
+ }
+ ]
+ }
+ }
}
]
diff --git a/context/cli_example.py b/context/cli_example.py
index 38a9bc357..398473024 100644
--- a/context/cli_example.py
+++ b/context/cli_example.py
@@ -181,6 +181,9 @@ def example_cost_tracking_setup():
- cost: Estimated cost in USD (e.g., 0.05 for 5 cents)
- input_files: List of input files involved
- output_files: List of output files generated/modified
+ - attempted_models: Semicolon-delimited model attempt chain. A
+ single-model success contains the successful model once; this field
+ is empty only when no LLM attempt was recorded.
Args:
None
@@ -355,4 +358,4 @@ def main():
if __name__ == '__main__':
- main()
\ No newline at end of file
+ main()
diff --git a/context/core/cli_example.py b/context/core/cli_example.py
index d0b8d6ad1..398473024 100644
--- a/context/core/cli_example.py
+++ b/context/core/cli_example.py
@@ -181,6 +181,9 @@ def example_cost_tracking_setup():
- cost: Estimated cost in USD (e.g., 0.05 for 5 cents)
- input_files: List of input files involved
- output_files: List of output files generated/modified
+ - attempted_models: Semicolon-delimited model attempt chain. A
+ single-model success contains the successful model once; this field
+ is empty only when no LLM attempt was recorded.
Args:
None
diff --git a/context/track_cost_example.py b/context/track_cost_example.py
index c35b4db9e..b58c2dcdb 100644
--- a/context/track_cost_example.py
+++ b/context/track_cost_example.py
@@ -1,72 +1,166 @@
-import click
-from typing import Optional, Tuple
+"""
+Example demonstrating how to use the ``track_cost`` decorator from pdd.track_cost.
+
+``track_cost`` is a Click command decorator. It wraps a Click command function so
+that every invocation appends a row to a CSV file containing:
+
+ timestamp, model, command, cost, input_files, output_files, attempted_models
+
+Inputs:
+ func (Callable): the Click command being wrapped.
+
+Outputs:
+ Callable: a wrapper that forwards the command's return value unchanged.
+ Side effect: a CSV row appended to the path configured via
+ ``ctx.obj['output_cost']`` (preferred) or the ``PDD_OUTPUT_COST_PATH``
+ environment variable.
+
+The decorator skips CSV writing entirely when ``PYTEST_CURRENT_TEST`` is set,
+so this example clears that env var to demonstrate the real behavior.
+"""
import os
-from pdd.track_cost import track_cost # Absolute import of the track_cost decorator
-from rich import print as rprint
+import sys
+import csv
+import tempfile
+
+# Make ``pdd.track_cost`` importable regardless of the working directory.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import click # noqa: E402
+from click.testing import CliRunner # noqa: E402
+
+from pdd.track_cost import track_cost # noqa: E402
+
@click.group()
-@click.option('--output-cost', type=click.Path(), help='Enable cost tracking and output a CSV file with usage details.')
+@click.option(
+ "--output-cost",
+ type=click.Path(),
+ help="Enable cost tracking and write a CSV with usage details.",
+)
@click.pass_context
def cli(ctx, output_cost):
- """PDD Command-Line Interface.
-
- PDD is a tool for processing prompts and generating outputs with cost tracking.
- """
+ """Demo CLI for the track_cost decorator."""
ctx.ensure_object(dict)
- ctx.obj['output_cost'] = output_cost
+ ctx.obj["output_cost"] = output_cost
+
@cli.command()
-@click.option(
- '--prompt-file',
- type=click.Path(exists=True),
- required=True,
- help='Path to the input prompt file.'
-)
-@click.option(
- '--output',
- type=click.Path(),
- required=False,
- help='Path to the output file.'
-)
+@click.option("--prompt-file", type=click.Path(exists=True), required=True)
+@click.option("--output", type=click.Path(), required=False)
@click.pass_context
@track_cost
-def generate(ctx, prompt_file: str, output: Optional[str]) -> Tuple[str, float, str]:
+def generate(ctx, prompt_file, output):
+ """Legacy result shape: returns (output, cost, model_name).
+
+ Returns:
+ Tuple[str, float, str]: (output text, cost in USD, model name).
"""
- Generate output based on the provided prompt file.
-
- This command reads a prompt from the specified input file, processes it,
- and writes the result to the output file. It also returns the cost of
- the operation and the model used.
-
- Parameters:
- prompt_file (str): Path to the input prompt file.
- output (Optional[str]): Path to the output file. If not provided, output is printed to console.
-
+ with open(prompt_file, "r", encoding="utf-8") as fh:
+ prompt = fh.read()
+ generated = f"Processed prompt: {prompt[:40]}"
+ if output:
+ with open(output, "w", encoding="utf-8") as out_fh:
+ out_fh.write(generated)
+ # Legacy result shape: (..., cost, model_name).
+ return generated, 0.05, "gpt-4"
+
+
+@cli.command()
+@click.option("--prompt-file", type=click.Path(exists=True), required=True)
+@click.option("--output", type=click.Path(), required=False)
+@click.pass_context
+@track_cost
+def generate_enriched(ctx, prompt_file, output):
+ """Enriched result shape: last element is a dict.
+
+ The dict contains ``cost``, ``model_name`` and ``attempted_models`` so
+ track_cost can record the full fallback chain.
+
Returns:
- Tuple[str, float, str]:
- - Generated output as a string.
- - Cost of execution in dollars per million tokens.
- - Model name used for generation.
+ Tuple[str, dict]: (output text, metadata dict).
"""
- # Simulate processing the prompt and generating output
- with open(prompt_file, 'r', encoding='utf-8') as file:
- prompt = file.read()
-
- # Placeholder for actual generation logic
- generated_output = f"Processed prompt: {prompt}"
-
- # Simulate cost and model name
- cost = 0.05 # Dollars per million tokens
- model_name = "gpt-4"
-
- # Write output to file if specified
+ with open(prompt_file, "r", encoding="utf-8") as fh:
+ prompt = fh.read()
+ generated = f"Processed (enriched): {prompt[:40]}"
if output:
- with open(output, 'w', encoding='utf-8') as out_file:
- out_file.write(generated_output)
- else:
- rprint(generated_output)
-
- return generated_output, cost, model_name
-
-if __name__ == '__main__':
- cli(['--output-cost', 'cost.csv', 'generate', '--prompt-file', 'README.md', '--output', 'output.txt'])
\ No newline at end of file
+ with open(output, "w", encoding="utf-8") as out_fh:
+ out_fh.write(generated)
+ metadata = {
+ "cost": 0.12,
+ "model_name": "claude-sonnet",
+ # First attempt failed, then we fell back to claude-sonnet.
+ "attempted_models": ["claude-opus", "claude-sonnet"],
+ }
+ return generated, metadata
+
+
+def _print_csv(path):
+ print(f"CSV file: {path}")
+ print("Contents:")
+ with open(path, "r", encoding="utf-8") as fh:
+ reader = csv.reader(fh)
+ for row in reader:
+ print(" " + ",".join(row))
+
+
+def main():
+ # When running under pytest, track_cost intentionally skips CSV writing.
+ # Clear that env var so this standalone example actually writes the CSV.
+ # IMPORTANT: do this inside main() — NOT at module top-level — so that
+ # `import context.track_cost_example` from a pytest session does not
+ # silently disable track_cost's CSV-skip guard for the rest of the run.
+ os.environ.pop("PYTEST_CURRENT_TEST", None)
+
+ # Work in an isolated temp directory so the example never writes into the
+ # project tree and is fully reproducible.
+ workdir = tempfile.mkdtemp(prefix="track_cost_example_")
+ prompt_path = os.path.join(workdir, "prompt.txt")
+ output_path = os.path.join(workdir, "output.txt")
+ cost_csv = os.path.join(workdir, "cost.csv")
+
+ with open(prompt_path, "w", encoding="utf-8") as fh:
+ fh.write("Write a haiku about Python.")
+
+ runner = CliRunner()
+
+ print("=== Invocation 1: legacy result shape ===")
+ result1 = runner.invoke(
+ cli,
+ [
+ "--output-cost", cost_csv,
+ "generate",
+ "--prompt-file", prompt_path,
+ "--output", output_path,
+ ],
+ catch_exceptions=False,
+ )
+ print(f"exit_code={result1.exit_code}")
+
+ print()
+ print("=== Invocation 2: enriched result shape with attempted_models ===")
+ result2 = runner.invoke(
+ cli,
+ [
+ "--output-cost", cost_csv,
+ "generate-enriched",
+ "--prompt-file", prompt_path,
+ "--output", output_path,
+ ],
+ catch_exceptions=False,
+ )
+ print(f"exit_code={result2.exit_code}")
+
+ print()
+ _print_csv(cost_csv)
+
+ print()
+ print("Done. Notice:")
+ print(" - Columns are in the order: "
+ "timestamp, model, command, cost, input_files, output_files, attempted_models")
+ print(" - Row 1 has an empty attempted_models cell (legacy shape).")
+ print(" - Row 2 records the fallback chain 'claude-opus;claude-sonnet'.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/pdd/fix_error_loop.py b/pdd/fix_error_loop.py
index e719f068e..a0c82fce3 100644
--- a/pdd/fix_error_loop.py
+++ b/pdd/fix_error_loop.py
@@ -6,7 +6,7 @@
import json
from datetime import datetime
from pathlib import Path
-from typing import Tuple, Optional
+from typing import List, Tuple, Optional
import requests
from rich import print as rprint
@@ -16,6 +16,7 @@
# Relative import from an internal module.
from .get_language import get_language
from .fix_errors_from_unit_tests import fix_errors_from_unit_tests
+from .llm_invoke import _propagate_attempted_models_to_ctx
from . import DEFAULT_TIME # Import DEFAULT_TIME
from .python_env_detector import detect_host_python_executable
from .agentic_fix import run_agentic_fix
@@ -39,6 +40,24 @@ def escape_brackets(text: str) -> str:
return text.replace("[", "\\[").replace("]", "\\]")
+def _raise_cloud_fix_error(message: str, chain: Optional[List[str]] = None) -> None:
+ """Raise a ``RuntimeError`` with ``attempted_models`` attached.
+
+ Mirrors the pattern from ``_llm_invoke_cloud`` (round 2): when a cloud
+ HTTP attempt fails AFTER the request was dispatched, the chain needs
+ to escape via the exception so the caller can recover it for
+ ``track_cost`` (issue #1086). When the server failed before reporting
+ a model, fall back to the ``cloud:auto`` sentinel.
+ """
+ err = RuntimeError(message)
+ try:
+ err.attempted_models = list(chain) if chain else ['cloud:auto']
+ except Exception:
+ # Some exception subclasses are immutable; best-effort attach only.
+ pass
+ raise err
+
+
def cloud_fix_errors(
unit_test: str,
code: str,
@@ -52,7 +71,7 @@ def cloud_fix_errors(
code_file_ext: str = ".py",
protect_tests: bool = False,
failure_classification: str | None = None,
-) -> Tuple[bool, bool, str, str, str, float, str]:
+) -> Tuple[bool, bool, str, str, str, float, str, List[str]]:
"""
Call the cloud fixCode endpoint to fix errors in code and unit tests.
@@ -81,6 +100,12 @@ def cloud_fix_errors(
- analysis: Analysis/explanation of fixes
- total_cost: Cost of the operation
- model_name: Name of model used
+ - attempted_models: Ordered list of cloud-side model identifiers
+ tried, each prefixed with ``cloud:``. Accepts the camelCase
+ ``attemptedModels``, snake_case ``attempted_models``, and legacy
+ ``modelChain`` response keys per the issue #1086 cloud contract;
+ falls back to ``[model_name]`` with a warning when none are
+ present (non-conforming server degraded mode).
Raises:
RuntimeError: When cloud execution fails with non-recoverable error
@@ -135,13 +160,39 @@ def cloud_fix_errors(
update_unit_test = response_data.get("updateUnitTest", False)
update_code = response_data.get("updateCode", False)
+ # Issue #1086 cloud contract: parse the chronological cloud-side model
+ # chain. Accept the canonical camelCase key plus snake_case and legacy
+ # `modelChain` aliases for backward compatibility, and degrade to a
+ # single-entry chain when the server is non-conforming.
+ raw_chain = (
+ response_data.get("attemptedModels")
+ or response_data.get("attempted_models")
+ or response_data.get("modelChain")
+ )
+ if not raw_chain:
+ console.print(
+ "[yellow]Cloud /fixCode response omitted attemptedModels "
+ "(issue #1086) — degrading to single-entry chain from "
+ "modelName.[/yellow]"
+ )
+ raw_chain = [model_name] if model_name else []
+ elif not isinstance(raw_chain, list):
+ # Non-conforming server may return a scalar string; wrap so the
+ # comprehension below doesn't iterate over characters.
+ raw_chain = [raw_chain]
+ attempted_models = [
+ str(m) if str(m).startswith("cloud:") else f"cloud:{m}"
+ for m in raw_chain
+ if m
+ ]
+
if verbose:
console.print(f"[cyan]Cloud fix completed. Model: {model_name}, Cost: ${total_cost:.6f}[/cyan]")
- return update_unit_test, update_code, fixed_unit_test, fixed_code, analysis, total_cost, model_name
+ return update_unit_test, update_code, fixed_unit_test, fixed_code, analysis, total_cost, model_name, attempted_models
except requests.exceptions.Timeout:
- raise RuntimeError(f"Cloud fix timed out after {get_cloud_timeout()}s")
+ _raise_cloud_fix_error(f"Cloud fix timed out after {get_cloud_timeout()}s")
except requests.exceptions.HTTPError as e:
status_code = e.response.status_code if e.response else 0
@@ -153,24 +204,26 @@ def cloud_fix_errors(
error_data = e.response.json()
current_balance = error_data.get("currentBalance", "unknown")
estimated_cost = error_data.get("estimatedCost", "unknown")
- raise RuntimeError(f"Insufficient credits. Balance: {current_balance}, estimated cost: {estimated_cost}")
+ _raise_cloud_fix_error(
+ f"Insufficient credits. Balance: {current_balance}, estimated cost: {estimated_cost}"
+ )
except json.JSONDecodeError:
- raise RuntimeError(f"Insufficient credits: {err_content}")
+ _raise_cloud_fix_error(f"Insufficient credits: {err_content}")
elif status_code == 401:
- raise RuntimeError(f"Authentication failed: {err_content}")
+ _raise_cloud_fix_error(f"Authentication failed: {err_content}")
elif status_code == 403:
- raise RuntimeError(f"Access denied: {err_content}")
+ _raise_cloud_fix_error(f"Access denied: {err_content}")
elif status_code == 400:
- raise RuntimeError(f"Invalid request: {err_content}")
+ _raise_cloud_fix_error(f"Invalid request: {err_content}")
else:
# 5xx or other errors - raise for caller to handle
- raise RuntimeError(f"Cloud HTTP error ({status_code}): {err_content}")
+ _raise_cloud_fix_error(f"Cloud HTTP error ({status_code}): {err_content}")
except requests.exceptions.RequestException as e:
- raise RuntimeError(f"Cloud network error: {e}")
+ _raise_cloud_fix_error(f"Cloud network error: {e}")
except json.JSONDecodeError:
- raise RuntimeError("Cloud returned invalid JSON response")
+ _raise_cloud_fix_error("Cloud returned invalid JSON response")
# ---------- Normalize any agentic return shape to a 4-tuple ----------
@@ -699,10 +752,24 @@ def fix_error_loop(unit_test_file: str,
# Format the log for the LLM - includes local test results
formatted_log = format_log_for_output(log_structure)
+ # attempted_models is only populated by the cloud path (8-tuple
+ # return); local fix_errors_from_unit_tests returns a 7-tuple and
+ # routes its chain through llm_invoke's own ctx propagation.
+ attempted_models: List[str] = []
+
if use_cloud:
# Use cloud LLM for fix - local test results passed via formatted_log
try:
- updated_unit_test, updated_code, fixed_unit_test, fixed_code, analysis, cost, model_name = cloud_fix_errors(
+ (
+ updated_unit_test,
+ updated_code,
+ fixed_unit_test,
+ fixed_code,
+ analysis,
+ cost,
+ model_name,
+ attempted_models,
+ ) = cloud_fix_errors(
unit_test=unit_test_contents,
code=code_contents,
prompt=prompt,
@@ -716,7 +783,22 @@ def fix_error_loop(unit_test_file: str,
protect_tests=protect_tests,
failure_classification=failure_hint,
)
+ # Surface the cloud-side chain on the active Click ctx
+ # so the @track_cost decorator records it on the fix
+ # command's cost.csv row even when the legacy 6-tuple
+ # return shape of fix_main discards attempted_models.
+ if attempted_models:
+ _propagate_attempted_models_to_ctx(attempted_models)
except RuntimeError as cloud_err:
+ # Recover the cloud attempt from the exception (attached
+ # by _raise_cloud_fix_error on every HTTP-error path) so
+ # @track_cost still records the cloud attempt even on
+ # failed cloud → fallback-local or failed cloud → stop
+ # paths. Issue #1086 contract: cloud attempts MUST land
+ # in cost.csv whether or not the cloud call succeeded.
+ failed_chain = getattr(cloud_err, "attempted_models", None) or []
+ if failed_chain:
+ _propagate_attempted_models_to_ctx(failed_chain)
# Cloud failed - fall back to local if it's a recoverable error
if "Insufficient credits" in str(cloud_err) or "Authentication failed" in str(cloud_err) or "Access denied" in str(cloud_err):
# Non-recoverable errors - stop the loop
diff --git a/pdd/llm_invoke.py b/pdd/llm_invoke.py
index b79625fa7..a71dda98c 100644
--- a/pdd/llm_invoke.py
+++ b/pdd/llm_invoke.py
@@ -528,6 +528,36 @@ def _validate_with_pydantic(
raise ValueError(f"Cannot validate result type {type(result)} with Pydantic model")
+def _propagate_attempted_models_to_ctx(attempted_models: List[str]) -> None:
+ """Best-effort: record the model fallback chain on the current Click
+ context's ``ctx.obj['attempted_models']`` so :mod:`pdd.track_cost` can
+ write it into the per-command cost CSV. No-op when not inside a Click
+ invocation or when Click isn't importable.
+ """
+ if not attempted_models:
+ return
+ try:
+ import click # Lazy import to avoid hard dependency at module load
+ ctx = click.get_current_context(silent=True)
+ if ctx is None or ctx.obj is None or not hasattr(ctx.obj, "__setitem__"):
+ return
+ # Pure concatenation across invocation boundaries. The contract is
+ # "called at most once per llm_invoke", and each llm_invoke already
+ # pre-deduplicates its own chain before propagation, so the boundary
+ # is a discrete chronological event. Two sibling llm_invoke calls
+ # that reuse the same model (e.g. both report ['gpt-4']) must
+ # combine to ['gpt-4','gpt-4'] to preserve real call history —
+ # de-duplicating at the boundary would silently drop LLM calls.
+ existing = ctx.obj.get("attempted_models") or []
+ if not isinstance(existing, list):
+ existing = []
+ incoming = [str(name) for name in attempted_models if name]
+ ctx.obj["attempted_models"] = list(existing) + incoming
+ except Exception:
+ # Cost tracking is best-effort; never raise from this helper.
+ pass
+
+
def _llm_invoke_cloud(
prompt: Optional[str],
input_json: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]],
@@ -603,6 +633,14 @@ def _llm_invoke_cloud(
if verbose:
logger.debug(f"Cloud llm_invoke request to: {cloud_url}")
+ def _with_cloud_attempt(exc: Exception) -> Exception:
+ """Attach the cloud sentinel only after the HTTP boundary is reached."""
+ try:
+ exc.attempted_models = ["cloud:auto"]
+ except Exception:
+ pass
+ return exc
+
try:
response = requests.post(
cloud_url,
@@ -638,16 +676,41 @@ def _llm_invoke_cloud(
# Return raw result if validation fails
pass
+ # The cloud /llmInvoke contract REQUIRES `attemptedModels` so the
+ # local cost CSV can record any abandoned-then-retried chain that
+ # happened server-side (issue #1086). Accept the camelCase,
+ # snake_case, and legacy `modelChain` keys for backward
+ # compatibility with older / non-conforming servers; when none of
+ # those are present log a warning and degrade to `[modelName]` —
+ # a single-entry chain that loses any cloud-side fallback history.
+ cloud_attempted = (
+ data.get("attemptedModels")
+ or data.get("attempted_models")
+ or data.get("modelChain")
+ )
+ if cloud_attempted and isinstance(cloud_attempted, list):
+ attempted_models_out = [str(m) for m in cloud_attempted if m]
+ else:
+ logger.warning(
+ "Cloud /llmInvoke success response omitted required field "
+ "'attemptedModels' (issue #1086). Falling back to "
+ "[modelName] — abandoned cloud-side model attempts will "
+ "be lost from cost tracking."
+ )
+ model_name_out = data.get("modelName", "cloud_model")
+ attempted_models_out = [str(model_name_out)] if model_name_out else []
+
return {
"result": result,
"cost": data.get("totalCost", 0.0),
"model_name": data.get("modelName", "cloud_model"),
"thinking_output": data.get("thinkingOutput"),
+ "attempted_models": attempted_models_out,
}
elif response.status_code == 402:
error_msg = response.json().get("error", "Insufficient credits")
- raise InsufficientCreditsError(error_msg)
+ raise _with_cloud_attempt(InsufficientCreditsError(error_msg))
elif response.status_code in (401, 403):
# Clear stale JWT cache to prevent repeated failures
@@ -662,22 +725,22 @@ def _llm_invoke_cloud(
f"Authentication expired ({server_error or response.status_code}). "
"Please re-authenticate with: pdd auth logout && pdd auth login"
)
- raise CloudFallbackError(error_msg)
+ raise _with_cloud_attempt(CloudFallbackError(error_msg))
elif response.status_code >= 500:
error_msg = response.json().get("error", f"Server error ({response.status_code})")
- raise CloudFallbackError(error_msg)
+ raise _with_cloud_attempt(CloudFallbackError(error_msg))
else:
error_msg = response.json().get("error", f"HTTP {response.status_code}")
- raise CloudInvocationError(f"Cloud llm_invoke failed: {error_msg}")
+ raise _with_cloud_attempt(CloudInvocationError(f"Cloud llm_invoke failed: {error_msg}"))
- except requests.exceptions.Timeout:
- raise CloudFallbackError("Cloud request timed out")
+ except requests.exceptions.Timeout as e:
+ raise _with_cloud_attempt(CloudFallbackError("Cloud request timed out")) from e
except requests.exceptions.ConnectionError as e:
- raise CloudFallbackError(f"Cloud connection failed: {e}")
+ raise _with_cloud_attempt(CloudFallbackError(f"Cloud connection failed: {e}")) from e
except requests.exceptions.RequestException as e:
- raise CloudFallbackError(f"Cloud request failed: {e}")
+ raise _with_cloud_attempt(CloudFallbackError(f"Cloud request failed: {e}")) from e
def _is_wsl_environment() -> bool:
@@ -2980,6 +3043,12 @@ def llm_invoke(
},
)
+ # Chronological chain of every model PDD attempts during this invocation.
+ # Cloud attempts are merged from _llm_invoke_cloud only after that helper
+ # reaches the HTTP boundary; pre-request auth/setup failures are not model
+ # attempts and must not pollute the chain.
+ attempted_models_chain: List[str] = []
+
if use_cloud:
from rich.console import Console
console = Console()
@@ -2989,7 +3058,7 @@ def llm_invoke(
try:
_emit_llm_attribution(attribution_context, "llm_invoke.cloud_dispatch")
- return _llm_invoke_cloud(
+ cloud_result = _llm_invoke_cloud(
prompt=prompt,
input_json=input_json,
strength=strength,
@@ -3002,8 +3071,32 @@ def llm_invoke(
messages=messages,
language=language,
)
+ # Prefix whatever the cloud actually reported so callers can tell
+ # cloud and local attempts apart in the cost CSV.
+ cloud_reported = cloud_result.get("attempted_models") if isinstance(cloud_result, dict) else None
+ if cloud_reported:
+ attempted_models_chain = []
+ for entry in cloud_reported:
+ text = str(entry)
+ if not text:
+ continue
+ if not text.startswith("cloud:"):
+ text = f"cloud:{text}"
+ if not attempted_models_chain or attempted_models_chain[-1] != text:
+ attempted_models_chain.append(text)
+ elif isinstance(cloud_result, dict) and cloud_result.get("model_name"):
+ attempted_models_chain = [f"cloud:{cloud_result['model_name']}"]
+ if isinstance(cloud_result, dict):
+ cloud_result["attempted_models"] = list(attempted_models_chain)
+ _propagate_attempted_models_to_ctx(attempted_models_chain)
+ return cloud_result
except CloudFallbackError as e:
- # Notify user and fall back to local execution
+ # Notify user and fall back to local execution.
+ # NOTE: do NOT call _propagate_attempted_models_to_ctx here.
+ # The "at most once per llm_invoke" contract requires that
+ # propagation happens exactly once — either inside the pre-loop
+ # setup try/except wrapper (on setup failure) or after the local
+ # candidate loop finishes (on success / all-candidates RuntimeError).
console.print(f"[yellow]Cloud execution failed ({e}), falling back to local execution...[/yellow]")
logger.warning(f"Cloud fallback: {e}")
_emit_llm_attribution(
@@ -3011,13 +3104,29 @@ def llm_invoke(
"llm_invoke.cloud_fallback",
**_safe_error_fields(e),
)
- # Continue to local execution below
- except InsufficientCreditsError:
- # Re-raise credit errors - user needs to know
+ for name in getattr(e, "attempted_models", []) or []:
+ text = str(name)
+ if text and (not attempted_models_chain or attempted_models_chain[-1] != text):
+ attempted_models_chain.append(text)
+ except InsufficientCreditsError as e:
+ # Re-raise credit errors - user needs to know. Attach the
+ # cloud attempt so callers / track_cost can still record it.
_emit_llm_attribution(attribution_context, "llm_invoke.cloud_insufficient_credits")
+ for name in getattr(e, "attempted_models", []) or []:
+ text = str(name)
+ if text and (not attempted_models_chain or attempted_models_chain[-1] != text):
+ attempted_models_chain.append(text)
+ try:
+ e.attempted_models = list(attempted_models_chain)
+ except Exception:
+ pass
+ _propagate_attempted_models_to_ctx(attempted_models_chain)
raise
except CloudInvocationError as e:
- # Non-recoverable cloud error - notify and fall back
+ # Non-recoverable cloud error - notify and fall back to local.
+ # NOTE: do NOT call _propagate_attempted_models_to_ctx here for the
+ # same "at most once per llm_invoke" reason documented above on
+ # CloudFallbackError.
console.print(f"[yellow]Cloud error ({e}), falling back to local execution...[/yellow]")
logger.warning(f"Cloud invocation error: {e}")
_emit_llm_attribution(
@@ -3025,7 +3134,10 @@ def llm_invoke(
"llm_invoke.cloud_error",
**_safe_error_fields(e),
)
- # Continue to local execution below
+ for name in getattr(e, "attempted_models", []) or []:
+ text = str(name)
+ if text and (not attempted_models_chain or attempted_models_chain[-1] != text):
+ attempted_models_chain.append(text)
# --- 2. Local execution uses already-validated formatted_messages ---
@@ -3045,24 +3157,44 @@ def llm_invoke(
if time is None:
time = 0.0
- if not (0.0 <= strength <= 1.0):
- raise ValueError("'strength' must be between 0.0 and 1.0.")
- if not (0.0 <= temperature <= 2.0): # Common range for temperature
- warnings.warn("'temperature' is outside the typical range (0.0-2.0).")
- if not (0.0 <= time <= 1.0):
- raise ValueError("'time' must be between 0.0 and 1.0.")
-
- # --- 2. Load Model Data & Select Candidates ---
+ # Wrap pre-candidate-loop setup so any failure (parameter validation,
+ # model CSV load, candidate selection) still surfaces the cloud attempts
+ # already merged into attempted_models_chain. Without this, e.g. a
+ # missing/corrupt llm_model.csv after a CloudFallbackError would escape
+ # naked and track_cost would lose the cloud entry.
try:
- model_df = _load_model_data(LLM_MODEL_CSV_PATH)
- candidate_models = _select_model_candidates(strength, DEFAULT_BASE_MODEL, model_df)
- except (FileNotFoundError, ValueError, RuntimeError) as e:
- logger.error(f"Failed during model loading or selection: {e}")
- _emit_llm_attribution(
- attribution_context,
- "llm_invoke.model_selection_error",
- **_safe_error_fields(e),
- )
+ if not (0.0 <= strength <= 1.0):
+ raise ValueError("'strength' must be between 0.0 and 1.0.")
+ if not (0.0 <= temperature <= 2.0): # Common range for temperature
+ warnings.warn("'temperature' is outside the typical range (0.0-2.0).")
+ if not (0.0 <= time <= 1.0):
+ raise ValueError("'time' must be between 0.0 and 1.0.")
+
+ # --- 2. Load Model Data & Select Candidates ---
+ try:
+ model_df = _load_model_data(LLM_MODEL_CSV_PATH)
+ candidate_models = _select_model_candidates(strength, DEFAULT_BASE_MODEL, model_df)
+ except (FileNotFoundError, ValueError, RuntimeError) as e:
+ logger.error(f"Failed during model loading or selection: {e}")
+ _emit_llm_attribution(
+ attribution_context,
+ "llm_invoke.model_selection_error",
+ **_safe_error_fields(e),
+ )
+ raise
+ except Exception as e:
+ try:
+ e.attempted_models = list(attempted_models_chain)
+ except Exception:
+ # Some exception types are immutable (e.g. some C extension errors);
+ # best-effort attach only.
+ pass
+ # Propagate to ctx here so track_cost can recover the cloud chain
+ # even if the exception object lost the attribute. This is the only
+ # propagation site for pre-loop setup failures (the cloud-error
+ # handlers themselves no longer propagate, to preserve the "at most
+ # once per llm_invoke" invariant).
+ _propagate_attempted_models_to_ctx(attempted_models_chain)
raise
_emit_llm_attribution(
@@ -3131,7 +3263,14 @@ def calc_strength(candidate):
# --- 3. Iterate Through Candidates and Invoke LLM ---
last_exception = None
newly_acquired_keys: Dict[str, bool] = {} # Track keys obtained in this run
-
+ # attempted_models_chain is initialized above the cloud try block so any
+ # cloud attempts already recorded are preserved when execution falls
+ # through to local. Each local candidate is appended exactly once when
+ # we actually attempt a provider call (i.e., after _ensure_api_key
+ # succeeds); the chain is returned with the result and attached to
+ # exceptions on total failure so callers / cost tracking can see the
+ # full fallback chain — including the final successful model.
+
# Initialize variables for retry section
response_format = None
time_kwargs = {}
@@ -3164,7 +3303,10 @@ def calc_strength(candidate):
# --- 4. API Key Check & Acquisition ---
if not _ensure_api_key(model_info, newly_acquired_keys, verbose):
- # Problem getting key, break inner loop, try next model candidate
+ # Problem getting key, break inner loop, try next model candidate.
+ # NOTE: do NOT record this candidate in attempted_models_chain
+ # because no provider call was made — the prompt contract
+ # requires only attempted calls to appear in the chain.
_emit_llm_attribution(
attribution_context,
"llm_invoke.model_skipped",
@@ -3178,6 +3320,14 @@ def calc_strength(candidate):
logger.info(f"[SKIP] Skipping {model_name_litellm} due to API key/credentials issue after prompt.")
break # Breaks the 'while retry_with_same_model' loop
+ # NOTE: The candidate is NOT recorded here. The prompt contract
+ # (llm_invoke_python.prompt: "Attempted-Model Tracking") requires
+ # the append to happen just before the actual provider call so
+ # context-window pre-validation rejections (and any other
+ # pre-call skips) do NOT pollute the chain. The append is
+ # performed immediately before each litellm.responses /
+ # litellm.batch_completion / litellm.completion invocation below.
+
# --- 5. Prepare LiteLLM Arguments ---
litellm_kwargs: Dict[str, Any] = {
"model": model_name_litellm,
@@ -3577,6 +3727,10 @@ def calc_strength(candidate):
"has_structured_text_format": text_block.get("format", {}).get("type") == "json_schema",
},
)
+ # Record candidate in chronological chain just before the
+ # actual provider call. Dedup against same-model cache-bypass retries.
+ if model_name_litellm and (not attempted_models_chain or attempted_models_chain[-1] != model_name_litellm):
+ attempted_models_chain.append(str(model_name_litellm))
resp = litellm.responses(**responses_kwargs)
call_duration = time_module.time() - call_start
@@ -3679,12 +3833,14 @@ def calc_strength(candidate):
finish_reason=finish_reason,
call_type="responses",
)
+ _propagate_attempted_models_to_ctx(attempted_models_chain)
return {
'result': final_result,
'cost': total_cost,
'model_name': model_name_litellm,
'thinking_output': None,
'finish_reason': finish_reason,
+ 'attempted_models': list(attempted_models_chain),
}
except Exception as e:
last_exception = e
@@ -3776,6 +3932,11 @@ def calc_strength(candidate):
context_limit=context_limit_for_attribution,
)
+ # Record candidate in chronological chain just before the
+ # actual provider call. Dedup against same-model cache-bypass retries.
+ if model_name_litellm and (not attempted_models_chain or attempted_models_chain[-1] != model_name_litellm):
+ attempted_models_chain.append(str(model_name_litellm))
+
if use_batch_mode:
if verbose:
logger.info(f"[INFO] Calling litellm.batch_completion for {model_name_litellm}...")
@@ -4344,12 +4505,20 @@ def calc_strength(candidate):
finish_reason=_LAST_CALLBACK_DATA.get("finish_reason"),
call_type=call_type_for_attribution,
)
+ # Surface the chain on the active Click context so the
+ # track_cost decorator records it even when callers reshape
+ # the return value into a tuple without the dict.
+ _propagate_attempted_models_to_ctx(attempted_models_chain)
return {
'result': final_result,
'cost': total_cost,
'model_name': model_name_litellm, # Actual model used
'thinking_output': final_thinking if final_thinking else None,
'finish_reason': _LAST_CALLBACK_DATA.get("finish_reason"),
+ # Chronological chain of every model PDD attempted, including
+ # any earlier candidates that failed and were abandoned and
+ # the final successful model. Consumed by track_cost.
+ 'attempted_models': list(attempted_models_chain),
}
# --- 6b. Handle Invocation Errors ---
@@ -4499,7 +4668,16 @@ def calc_strength(candidate):
failure_reason="all_candidate_models_failed",
last_error_type=type(last_exception).__name__ if last_exception else None,
)
- raise RuntimeError(error_message) from last_exception
+ # Surface the chain on the active Click context so the track_cost
+ # decorator records the abandoned attempts even on total failure, and
+ # attach it to the raised exception so direct callers can introspect.
+ _propagate_attempted_models_to_ctx(attempted_models_chain)
+ exhaustion_error = RuntimeError(error_message)
+ try:
+ exhaustion_error.attempted_models = list(attempted_models_chain)
+ except Exception:
+ pass
+ raise exhaustion_error from last_exception
# --- Example Usage (Optional) ---
if __name__ == "__main__":
diff --git a/pdd/prompts/fix_error_loop_python.prompt b/pdd/prompts/fix_error_loop_python.prompt
index 94d0e87f2..ac9ae3f41 100644
--- a/pdd/prompts/fix_error_loop_python.prompt
+++ b/pdd/prompts/fix_error_loop_python.prompt
@@ -5,7 +5,7 @@
"type": "module",
"module": {
"functions": [
- {"name": "cloud_fix_errors", "signature": "(unit_test, code, prompt, error, error_file, strength, temperature, verbose=False, time=DEFAULT_TIME, code_file_ext='.py', protect_tests=False, failure_classification=None)", "returns": "Tuple[bool, bool, str, str, str, float, str]"},
+ {"name": "cloud_fix_errors", "signature": "(unit_test, code, prompt, error, error_file, strength, temperature, verbose=False, time=DEFAULT_TIME, code_file_ext='.py', protect_tests=False, failure_classification=None)", "returns": "Tuple[bool, bool, str, str, str, float, str, List[str]]"},
{"name": "fix_error_loop", "signature": "(unit_test_file, code_file, prompt_file, prompt, verification_program, strength, temperature, max_attempts, budget, error_log_file='error_log.txt', verbose=False, time=DEFAULT_TIME, agentic_fallback=True, protect_tests=False, use_cloud=False, test_files=None, failure_aware_retries=True)", "returns": "Tuple[bool, str, str, int, float, str]"}
]
}
@@ -36,7 +36,7 @@ The module supports both local and cloud execution modes:
Ensure the `error_log_file` is populated with the current state before the agent is called.
7. **Non-Python Support:** Detect non-Python files by extension. Use `get_test_command_for_file` to run initial verification. If it fails, skip the standard pytest loop and go directly to the agentic fallback.
8. **Cost & Budget Management:** Accumulate `total_cost` from LLM calls and stop the loop immediately if the `budget` is exceeded.
-9. **Cloud Execution Support:** Implement `cloud_fix_errors` function that calls the cloud fixCode endpoint with the same interface as `fix_errors_from_unit_tests`. This enables hybrid mode where LLM calls go to the cloud while test execution stays local. When passing failure context to the cloud, **prepend** a `[PDD failure classification] ...` line (or equivalent prefix) to the `errors` string payload so the remote fix step sees the same classification as local `fix_errors_from_unit_tests`. `protect_tests` should be forwarded in the payload (as `protectTests`) and accepted in the `cloud_fix_errors` signature. The hosted `fixCode` backend does not yet read this field, so it is a forward-looking no-op in cloud mode — the flag is fully honored in local mode. Keep the param so the contract matches local `fix_errors_from_unit_tests` and flips on when the backend adds support.
+9. **Cloud Execution Support:** Implement `cloud_fix_errors` function that calls the cloud fixCode endpoint with the same interface as `fix_errors_from_unit_tests`. This enables hybrid mode where LLM calls go to the cloud while test execution stays local. When passing failure context to the cloud, **prepend** a `[PDD failure classification] ...` line (or equivalent prefix) to the `errors` string payload so the remote fix step sees the same classification as local `fix_errors_from_unit_tests`. `protect_tests` should be forwarded in the payload (as `protectTests`) and accepted in the `cloud_fix_errors` signature. The hosted `fixCode` backend does not yet read this field, so it is a forward-looking no-op in cloud mode — the flag is fully honored in local mode. Keep the param so the contract matches local `fix_errors_from_unit_tests` and flips on when the backend adds support. Parse the cloud-side attempted-model chain from the response per the issue #1086 cloud contract: accept the canonical camelCase `attemptedModels`, the snake_case `attempted_models`, and the legacy `modelChain` aliases (in that order). When the response omits all three keys, emit a yellow Rich warning ("Cloud /fixCode response omitted attemptedModels — degrading to single-entry chain from modelName") and fall back to `[modelName]`. If the parsed value is non-empty but not a list (a non-conforming server returning a scalar string `"gpt-4"` instead of `["gpt-4"]`), wrap it in a single-element list BEFORE building the prefixed chain — otherwise the comprehension iterates over characters and records `['cloud:g','cloud:p','cloud:t','cloud:-','cloud:4']` in the cost CSV. Each chain entry MUST be prefixed with `cloud:` to distinguish cloud attempts from local attempts in the unified cost CSV. Return an 8-tuple `(update_unit_test, update_code, fixed_unit_test, fixed_code, analysis, total_cost, model_name, attempted_models)`. In `fix_error_loop`, after unpacking the 8-tuple from `cloud_fix_errors`, best-effort call `_propagate_attempted_models_to_ctx(attempted_models)` from `pdd.llm_invoke` so the `@track_cost` decorator on the outer `fix` command records the full cloud chain even though `fix_main` and `fix_error_loop` themselves still return legacy 6-tuples that discard the chain. Every error path in `cloud_fix_errors` (`requests.exceptions.Timeout`, `HTTPError` 4xx/5xx including the 402/401/403/400 specific branches, `RequestException`, and `JSONDecodeError`) MUST attach `attempted_models=['cloud:auto']` to the raised `RuntimeError` via best-effort `exc.attempted_models = ['cloud:auto']` — the `cloud:auto` sentinel matches `_llm_invoke_cloud`'s pattern for failures where the server did not report a specific model. Centralize this in a `_raise_cloud_fix_error(message, chain=None)` helper so every error branch goes through one site and the attach can never be forgotten. The caller in `fix_error_loop` MUST wrap the `cloud_fix_errors` call in `try/except RuntimeError as e:`, harvest `getattr(e, 'attempted_models', [])`, and `_propagate_attempted_models_to_ctx(failed_chain)` BEFORE either re-raising or falling back to local — without this, the issue #1086 contract is violated on failed cloud runs because `track_cost` records only the later local model (or stays empty) even though a cloud attempt actually happened.
10. **Output Contract:** Return a 6-tuple: `(success, final_unit_test, final_code, total_attempts, total_cost, model_name)`.
* *Note:* If checks pass initially, return actual file contents, `0` for `total_attempts`, `0.0` for `total_cost`, and the same model-name convention as implementation (`""` for Python initial pass; `"N/A"` for non-Python initial pass).
11. **Result Normalization:** Implement `_normalize_agentic_result` to handle various return shapes (2, 3, 4, or 5 elements) from the agentic fix to ensure a consistent internal state, specifically extracting `changed_files` if available.
@@ -66,7 +66,7 @@ def cloud_fix_errors(
code_file_ext: str = ".py",
protect_tests: bool = False,
failure_classification: str | None = None,
-) -> Tuple[bool, bool, str, str, str, float, str]:
+) -> Tuple[bool, bool, str, str, str, float, str, List[str]]:
"""
Call the cloud fixCode endpoint to fix errors in code and unit tests.
@@ -77,10 +77,19 @@ def cloud_fix_errors(
protect_tests: When True, prevents the LLM from modifying test files.
failure_classification: Optional classification string to send to cloud.
- Returns: (update_unit_test, update_code, fixed_unit_test, fixed_code, analysis, total_cost, model_name)
+ Returns: (update_unit_test, update_code, fixed_unit_test, fixed_code, analysis, total_cost, model_name, attempted_models)
+ attempted_models is the chronological list of cloud-side model
+ identifiers tried (each prefixed with `cloud:`), parsed from the
+ response's `attemptedModels` / `attempted_models` / legacy
+ `modelChain` keys. Falls back to `[cloud:]` when none
+ are present (non-conforming server) and to `[cloud:auto]` on
+ cloud-error RuntimeError paths.
Raises:
- RuntimeError: When cloud execution fails with non-recoverable error
+ RuntimeError: When cloud execution fails with non-recoverable error.
+ Every error path attaches `attempted_models=['cloud:auto']` to
+ the raised exception so callers can recover the cloud attempt
+ for track_cost (issue #1086).
"""
def fix_error_loop(
diff --git a/pdd/prompts/llm_invoke_python.prompt b/pdd/prompts/llm_invoke_python.prompt
index 55cbfafcf..0598b0a2b 100644
--- a/pdd/prompts/llm_invoke_python.prompt
+++ b/pdd/prompts/llm_invoke_python.prompt
@@ -104,6 +104,18 @@
- 'model_name': Name of the selected model.
- 'thinking_output': Reasoning output if available.
- 'finish_reason': Provider completion reason when available. For litellm.completion/batch_completion, use the LiteLLM callback's `completion_response.choices[0].finish_reason`; for OpenAI Responses API, use the response status (for example "completed"). Return None when unavailable.
+ - 'attempted_models': Ordered list of every model name attempted during this invocation, in chronological order from first attempt to last. The final (successful) model is always the last element. Models that failed and were abandoned (provider auth errors, provider-side context overflow, schema validation failure, provider errors that triggered fallback) appear earlier in the list. The list contains a single element when no fallback occurred. Successful OpenAI Responses API calls MUST include this key just like litellm.completion/batch_completion calls. See "Attempted-Model Tracking" below for the exact recording rules.
+
+% Attempted-Model Tracking:
+ - Maintain an ordered list of attempted model names across the per-model candidate loop and across local↔cloud transitions within a single `llm_invoke` call.
+ - Append each model name the first time it is actually attempted (i.e., just before the `litellm.completion`/`batch_completion`/`responses` call, or the cloud HTTP call). Do NOT append for models that are skipped before any call (e.g., missing api_key, context-window pre-validation rejection). Record only models for which an actual provider call was attempted.
+ - Do NOT duplicate the same model on consecutive same-model cache-bypass retries (None content, malformed JSON, invalid Python). Same-model retries count as one attempt — only record the next *distinct* model if fallback escalates.
+ - For cloud execution: when use_cloud resolves to True, append the cloud sentinel `cloud:` (where `` is the model the cloud endpoint reports in its response, or the literal string `cloud:auto` if the cloud HTTP request fails before a response model is known) only at the point the cloud HTTP call is attempted. Do NOT record pre-request cloud setup/authentication failures, such as a missing JWT token. On CloudFallbackError / CloudInvocationError raised at or after the HTTP boundary, the local path that follows appends its own model attempts after the cloud entry, preserving the full chronological chain.
+ - Always include the final successful model as the last list element.
+ - When all candidates are exhausted and `llm_invoke` raises `RuntimeError`, attach the chain to the exception as an `attempted_models` attribute (e.g., `exc.attempted_models = [...]`) so callers / decorators (`track_cost`) can still surface the abandoned chain. Do this for `RuntimeError`, `SchemaValidationError`, and `InsufficientCreditsError` raised by `llm_invoke`.
+ - Propagation: `llm_invoke` MAY best-effort copy a non-empty `attempted_models` chain onto `click.get_current_context().obj['attempted_models']` so the `track_cost` decorator can record the full chain even when the command itself returns a legacy `(..., cost, model_name)` tuple shape. This propagation must be non-fatal when Click is unavailable or no current context exists; the exact merge semantics (pure concatenation, no boundary dedup) are specified by the "called at most once" rule below.
+ - Cloud→local handoff durability: wrap the pre-candidate-loop setup that runs after the `CloudFallbackError` / `CloudInvocationError` handlers — parameter validation (`strength`/`temperature`/`time` range checks), `_load_model_data`, and `_select_model_candidates` — in a `try/except Exception as e:` block that best-effort attaches `e.attempted_models = list(attempted_models_chain)` (suppress secondary errors when the exception object is immutable), best-effort calls `_propagate_attempted_models_to_ctx(attempted_models_chain)` so the ctx-side record survives in addition to the exception attribute, and then re-raises with a bare `raise`. Without this guard, any pre-loop failure between the cloud handlers and the candidate loop escapes naked, and `track_cost` records the failed command without the cloud attempts that already happened. Inside the `CloudFallbackError` and `CloudInvocationError` handlers themselves, MERGE the cloud-recorded attempts into the local `attempted_models_chain` but do NOT call `_propagate_attempted_models_to_ctx` directly there — see the "called at most once" rule below.
+ - `_propagate_attempted_models_to_ctx` called AT MOST ONCE per `llm_invoke` invocation. The helper composes the incoming chain with any pre-existing `ctx.obj['attempted_models']` via pure concatenation (`existing + incoming`). To preserve the "at most once" invariant the helper MUST be called at most once per `llm_invoke`: on final cloud success (right before `return cloud_result`), on final local success (after the candidate loop selects a result), inside the pre-loop setup `try/except Exception as e:` wrapper, and inside the `InsufficientCreditsError` handler (which raises immediately and never reaches the pre-loop wrapper). The `CloudFallbackError` / `CloudInvocationError` handlers MUST NOT call the helper directly; they fall through to either local success (one call) or the pre-loop wrapper (one call). The helper performs pure concatenation across invocation boundaries — it does NOT dedup the first element of `incoming` against the last element of `existing`. Two sibling `llm_invoke` calls that reuse the same model (e.g., both report `['gpt-4']`) must produce a combined chain of `['gpt-4','gpt-4']` on `ctx.obj['attempted_models']` to preserve chronological history. Each individual `llm_invoke` chain is already pre-deduplicated within the invocation by the chain-building code (the `not attempted_models_chain or attempted_models_chain[-1] != text` checks); collapsing further at the boundary would silently drop real LLM calls.
% Model Selection:
- Base model: `$PDD_MODEL_DEFAULT` env var, or None (first available model in CSV used as surrogate).
@@ -235,7 +247,7 @@
- On InsufficientCreditsError: Re-raise (no fallback).
- On CloudInvocationError: Print warning, fall back to local.
- Cloud request payload: prompt, inputJson, messages, strength, temperature, time, verbose, outputSchema, useBatchMode, language.
- - Cloud response: {result, totalCost, modelName, thinkingOutput}.
+ - Cloud response: `{result, totalCost, modelName, thinkingOutput, attemptedModels}`. `attemptedModels` is a REQUIRED field of the cloud `/llmInvoke` success contract (issue #1086) — the server MUST return the full chronological list of cloud-side model identifiers tried (final model last) so the local cost CSV can record the abandoned-then-retried chain that happened server-side. The client SHOULD log a warning when a cloud success response omits `attemptedModels` (and the snake_case `attempted_models` / legacy `modelChain` aliases). Callers MUST accept the camelCase `attemptedModels`, the snake_case `attempted_models`, and the legacy `modelChain` keys for backward compatibility with older / non-conforming servers; when none of those are present the client falls back to `[modelName]` — a single-entry chain that loses any cloud-side fallback history, which is the documented degraded mode for non-conforming servers, not a permitted server behavior. Returned dict shape is `{result, cost, model_name, thinking_output, attempted_models}` where `attempted_models` is the list described above.
% Cloud API Documentation:
diff --git a/pdd/prompts/summarize_directory_python.prompt b/pdd/prompts/summarize_directory_python.prompt
index 87dab1a96..9904ba292 100644
--- a/pdd/prompts/summarize_directory_python.prompt
+++ b/pdd/prompts/summarize_directory_python.prompt
@@ -40,7 +40,7 @@ Scans files in a directory, summarizes each using an LLM, and produces a CSV wit
9. Summarize each file via `llm_invoke` with Pydantic `FileSummary` model (`file_summary: str`, `key_exports: list[str]`, `dependencies: list[str]`) and `input_json={"file_contents": content}`
10. Error resilience: on file read or LLM failure, record error in `file_summary`, set `content_hash` to `"error"`, continue processing
11. Progress: if `progress_callback` provided, call `progress_callback(current, total)` per file; else if `verbose`, use Rich `Progress` and print per-file summaries for freshly summarized files only
-12. When `max_workers` > 1, parallelize LLM summarization calls using a thread pool. Cache lookups remain sequential; only LLM invocations are parallelized. Accumulate `total_cost` in a thread-safe manner.
+12. When `max_workers` > 1, parallelize LLM summarization calls using a thread pool. Cache lookups remain sequential; only LLM invocations are parallelized. Accumulate `total_cost` in a thread-safe manner. Worker threads' Click context is thread-local — `llm_invoke`'s in-helper `_propagate_attempted_models_to_ctx` call from inside a worker cannot reach the parent command's ctx. Each worker MUST therefore return the `llm_result.get('attempted_models', [])` chain alongside its `(cost, model_name)` so the orchestrator can aggregate chains across all files (in submission-index order, not completion order) and call `_propagate_attempted_models_to_ctx(aggregated_chain)` ONCE on the main thread after the executor shuts down. The aggregation step is best-effort — `_propagate_attempted_models_to_ctx` is a no-op when no Click context is active. The SERIAL paths (`max_workers == 1`) MUST NOT aggregate-and-propagate — `llm_invoke`'s own ctx propagation already lands the chain on `ctx.obj['attempted_models']` directly from the main thread, and a second propagation from the orchestrator would record each entry twice (e.g. `'m1;m1'` for a single LLM call). Gate the main-thread `_propagate_attempted_models_to_ctx` call on `max_workers > 1`. Inside `_process_single_file_logic`'s broad `except Exception as e:` block (which records an error row for failed files), harvest `getattr(e, 'attempted_models', None)` and merge into the local `attempted_models` list using the same per-element dedup as the chain-building loop (`combined[-1] != name`). This is the only way a worker thread can surface llm_invoke's exception-attached chain to the main-thread aggregator — the parent ctx is unreachable from the worker.
13. Incremental flushing: when `csv_path` provided, flush complete CSV to disk after each file for interrupt safety
14. After scanning, merge back any existing CSV entries that were not part of the current scan, so summaries from other directories are not silently dropped. Track scanned paths using normpath, absolute, and realpath forms for symlink consistency.
diff --git a/pdd/prompts/track_cost_python.prompt b/pdd/prompts/track_cost_python.prompt
index 1106103a3..e6f173c93 100644
--- a/pdd/prompts/track_cost_python.prompt
+++ b/pdd/prompts/track_cost_python.prompt
@@ -1,3 +1,16 @@
+Click decorator that tracks per-command LLM cost, model used, and full attempted-model fallback chain to a CSV.
+
+
+{
+ "type": "module",
+ "module": {
+ "functions": [
+ {"name": "track_cost", "signature": "(func)", "returns": "Callable"}
+ ]
+ }
+}
+
+
% You are an expert Python engineer. Your goal is to write a Python decorator function called `track_cost` that will track the cost of each command execution in the "pdd" command-line program. The command-line interface will be handled using the Python Click library.
% Here are the inputs and outputs of the function:
@@ -6,22 +19,30 @@
% The decorator should perform the following steps:
1. **Record the Start Time:** When the command function is called, record the current datetime as the start time.
-2. **Execute the Command Function:** Call the original command function and capture its result.
-3. **Record the End Time:** After the command completes, record the current datetime as the end time.
-4. **Access Command Context:** Retrieve the Click context using `click.get_current_context()`.
-5. **Retrieve Output Cost Option:** Check if the output cost option is specified directly via `ctx.obj` or via the `PDD_OUTPUT_COST_PATH` environment variable.
-6. **Prepare Cost Data:**
+2. **Scope `ctx.obj['attempted_models']` to this single invocation (issue #1086 regression guard):** Before calling `func`, retrieve the Click context with `click.get_current_context()`. If `ctx.obj` is a dict-like that supports `__setitem__`/`get`, snapshot any existing value of `ctx.obj['attempted_models']` (and whether the key was present at all) into local variables, then reset `ctx.obj['attempted_models']` to a fresh empty list. This guarantees that the chain `llm_invoke` aggregates on the shared `ctx.obj` during `func` belongs only to this `@track_cost` invocation, even when multiple `@track_cost`-decorated commands share the same `ctx.obj` in a chained / batched Click run. The reset must be best-effort: if `ctx` is `None`, `ctx.obj` is `None`, or the snapshot/reset raises, fall through to the unscoped behavior without aborting `func`.
+3. **Execute the Command Function:** Call the original command function and capture its result. Wrap the call in `try/except/finally` so the snapshot is always restored (next step) even when `func` raises.
+4. **Record the End Time:** After the command completes (success or exception), record the current datetime as the end time.
+5. **Capture the scoped chain and restore the snapshot:** Inside the `finally` block — before any CSV write logic — read whatever `llm_invoke` accumulated on `ctx.obj['attempted_models']` during `func` into a local list (`command_attempted_models`). Capture from `ctx.obj['attempted_models']` whether or not the snapshot/restore guard fired in step 2 — `ctx.obj` may have been `None` at command entry and created inside `func` (e.g. via `ctx.ensure_object(dict)`), so a missing snapshot is NOT evidence that no chain was recorded. The capture step only requires `ctx.obj is not None` and `hasattr(ctx.obj, 'get')` at finally-time. Then restore the previous shared value (still gated on `attempted_models_scoped`): if a prior value was present in the snapshot, write it back to `ctx.obj['attempted_models']`; if no prior value was present, `pop`/`del` the `'attempted_models'` key entirely so the next sibling command starts with a clean slate. Restoration must also be best-effort and must not raise.
+6. **Access Command Context for CSV write:** Continue to use the Click context obtained in step 2.
+7. **Retrieve Output Cost Option:** Check if the output cost option is specified directly via `ctx.obj` or via the `PDD_OUTPUT_COST_PATH` environment variable.
+8. **Prepare Cost Data:**
- Determine the command name from the context.
-- Extract the cost and model name from the command function's result if available. Generally, the second to the last element of the result tuple will be the cost and the last element will be the model name.
+- Extract the cost and model name from the command function's result if available. The result tuple may take one of two shapes:
+ - **Legacy shape:** `(..., cost, model_name)` — second-to-last element is the cost, last element is the model name.
+ - **Enriched shape:** the last element is a dict (e.g., `{"cost": ..., "model_name": ..., "attempted_models": [...]}`). When the last element is a dict, read cost from `dict["cost"]`, the final model from `dict["model_name"]`, and the ordered fallback chain from `dict["attempted_models"]`.
+- Determine the attempted-models chain using this precedence: (a) the locally captured `command_attempted_models` from step 5 if non-empty (preferred — this is the chain `llm_invoke` aggregated within THIS `@track_cost` invocation, NOT the live `ctx.obj['attempted_models']` which has already been restored to its prior value); otherwise (b) the `attempted_models` key on the result dict if the enriched shape was returned; otherwise (c) an empty list. Reading the live `ctx.obj['attempted_models']` here would re-introduce the cross-command leak this decorator now prevents.
- Collect input and output file paths from the command arguments using `kwargs`. Input file paths should be validated to ensure they exist, and output file paths should be identified by their parameter names starting with 'output'.
-7. **Append Cost Data to CSV File:**
+9. **Append Cost Data to CSV File:**
- Skip CSV writing entirely when running under pytest (`PYTEST_CURRENT_TEST` environment variable is set). This prevents mock return values from polluting the cost CSV during test runs.
- If the output cost file does not exist, create it and write the header row.
- If the file exists, append the cost data to the end of the file.
-- Ensure all columns are included in the CSV file: timestamp, model, command, cost, input files, and output files.
-8. **Handle Exceptions Gracefully:**
-- If any errors occur during cost logging, print an error message using Rich's `rprint` but do not interrupt the command execution.
-9. **Return the Command Result:** Return the result of the original command function.
+- Ensure all columns are included in the CSV file in this exact order: `timestamp, model, command, cost, input_files, output_files, attempted_models`. The `attempted_models` column is appended at the end for backward compatibility with existing readers; serialize the chain as a semicolon-delimited string in chronological order (first attempt → last attempt). Use the empty string when no chain was recorded.
+- **Legacy 6-column CSV migration (header-only peek, O(1) per call):** Because `@track_cost` runs on every CLI command, the existence check for a missing `attempted_models` column MUST be cheap. On every append where the file already has content, open the existing file just long enough to read the **first row only** (the header) using `csv.reader(...)` and `next(reader, None)`. Do NOT call `list(reader)` or otherwise materialize the remaining rows on this path — a long-lived cost CSV would otherwise add O(file size) I/O and memory to every command, making repeated invocations O(n²) over the file's lifetime. Only when the header is present AND `'attempted_models' not in existing_header` should you then consume the remaining rows with `list(reader)` (inside the same `with` block is fine) and rewrite the file with the migrated 7-column header, padding shorter rows so the column count matches. In every other case (header already includes `attempted_models`, or header could not be read), skip the rewrite entirely and proceed straight to opening the file in append mode and writing the new row.
+- **Atomic migration rewrite:** When the migration path fires, do NOT open the target file in `'w'` mode in place. Two concurrent PDD processes hitting a legacy 6-column `cost.csv` on first-run upgrade would race on the in-place rewrite and corrupt the file. Instead, write the migrated header + padded rows to a tempfile created via `tempfile.mkstemp(prefix=".cost.csv.migrate-", dir=)` and then `os.replace(tmp_path, output_cost_path)` — `os.replace` is atomic on a single filesystem (POSIX rename semantics on Linux/macOS, equivalent on Windows when source and destination share a volume). On any exception while writing or replacing, best-effort `os.unlink(tmp_path)` and re-raise.
+- **Cross-process serialization (portable atomic lock with nonce + PID liveness + migration-only requirement):** Atomic rename alone prevents partial writes but NOT lost-update races between concurrent PDD processes. Trace: P1 reads legacy rows; P2 reads legacy rows; P1 `os.replace`s to migrated; P1 appends row C; P2 `os.replace`s to migrated WITHOUT row C — row C is lost. The lock applies ONLY to the read-modify-write migration path. Pure appends to an already-migrated file are safe without the lock because POSIX `O_APPEND` guarantees atomicity for writes up to PIPE_BUF (≥4 KiB; our rows are well under that). Do a cheap pre-lock header peek to decide which path applies. **Migration path (lock required):** acquire an exclusive lock on a sibling `.lock` file via `os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)` for portability (works the same on POSIX and Windows; no `fcntl` dependency). Immediately write a unique `-` **nonce** to the lock file and verify the write completed (partial write → close+unlink+retry — orphaned malformed locks would block every subsequent PDD command for the mtime threshold). Lock acquisition has two failure modes that MUST be handled distinctly: (a) **contended timeout** (`saw_contention=True`) — another live PDD process is in the migration critical section; SKIP the cost row with a loud yellow Rich warning naming the file rather than racing; (b) **unsupported filesystem** (`saw_contention=False`, OSError on `os.open`) — `O_EXCL` semantics aren't available (e.g., some network mounts). Refusing forever would be worse than the theoretical multi-process race, so attempt UNLOCKED migration with a one-line warning that documents the tradeoff. Single-process use is the dominant case on such filesystems and migrates correctly unlocked. **Pre-lock header peek hardening:** the cheap header peek MUST catch `(OSError, csv.Error, UnicodeDecodeError, StopIteration)` — a malformed first row, non-UTF-8 bytes, or an empty file would otherwise crash the peek and silently drop the cost row at the outer `except`. On peek failure, treat as "migration unknown" and fall through to the append path; the csv.writer appends bytes regardless of pre-existing content corruption. **Pure-append path (lock best-effort):** acquire the lock if available so neighbors observe consistent ordering, but on any failure proceed unlocked relying on `O_APPEND` atomicity. **Staleness** is determined primarily by **PID liveness** (`os.kill(pid, 0)` raises `ProcessLookupError` when the holder is gone); a LIVE PID is never stolen regardless of how long the migration takes, so a legitimate slow migration is safe. mtime is consulted only as a long-threshold (~10 minutes) safety-net for malformed lock files with no parseable PID. **Release** in `finally`: `os.close(fd)`, then resolve strongest-evidence-first: (a) lock file gone → no-op; (b) nonce matches ours → unlink (normal release); (c) nonce belongs to a different owner → leave alone (our lock was stolen, the other owner now holds the path); (d) file exists but has no parseable nonce → unlink anyway (we acquired via O_EXCL so the path is ours, and a malformed file would otherwise block other PDD invocations). All best-effort with errors swallowed so cost tracking never breaks the wrapped command.
+10. **Handle Exceptions Gracefully:**
+- If any errors occur during cost logging, print an error message using Rich's `rprint` but do not interrupt the command execution. If the wrapped `func` raised, re-raise that original exception AFTER the CSV row has been written (so the row that mines `exception.attempted_models` and the scoped `command_attempted_models` still lands on disk for failed commands).
+11. **Return the Command Result:** Return the result of the original command function.
% The decorator should be robust and ensure that the main command functionality is not adversely affected by the cost tracking. Use the `functools.wraps` decorator to preserve the metadata of the original function.
diff --git a/pdd/summarize_directory.py b/pdd/summarize_directory.py
index 9f1aa48c9..3c8f13bc5 100644
--- a/pdd/summarize_directory.py
+++ b/pdd/summarize_directory.py
@@ -13,7 +13,7 @@
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
# Internal imports based on package structure
-from .llm_invoke import llm_invoke
+from .llm_invoke import llm_invoke, _propagate_attempted_models_to_ctx
from .load_prompt_template import load_prompt_template
from . import DEFAULT_TIME
@@ -234,6 +234,15 @@ def summarize_directory(
results_data: List[Dict[str, str]] = []
total_cost = 0.0
last_model_name = "cached"
+ # Threaded path only: aggregate per-worker chains and propagate from the
+ # main thread after the executor completes. Worker threads' Click ctx is
+ # thread-local — propagating from inside the worker would not reach the
+ # @track_cost decorator on the parent command. Serial paths
+ # (max_workers == 1) MUST NOT aggregate-and-propagate; llm_invoke's
+ # own ctx propagation (round-4 contract) already lands the chain on
+ # ctx.obj['attempted_models'], and a second propagate would record
+ # each entry twice (e.g. 'm1;m1' for one LLM call).
+ threaded_aggregated_chain: List[str] = []
# Step 6: Iterate through files with progress reporting
total_files = len(files)
@@ -242,7 +251,7 @@ def _compute_rel_path(file_path: str) -> str:
real = os.path.realpath(file_path)
return os.path.relpath(real, base_dir) if base_dir else real
- def _process_file(i: int, file_path: str) -> Tuple[float, str]:
+ def _process_file(i: int, file_path: str) -> Tuple[float, str, List[str]]:
"""Process a single file and accumulate results."""
rel_path = _compute_rel_path(file_path)
return _process_single_file_logic(
@@ -262,11 +271,11 @@ def _process_file(i: int, file_path: str) -> Tuple[float, str]:
from concurrent.futures import ThreadPoolExecutor, as_completed
cost_lock = threading.Lock()
- def _threaded_process(i: int, file_path: str) -> Tuple[int, float, str]:
+ def _threaded_process(i: int, file_path: str) -> Tuple[int, float, str, List[str], Optional[Dict[str, str]]]:
"""Thread-safe wrapper that returns index for ordering."""
rel_path = _compute_rel_path(file_path)
local_results: List[Dict[str, str]] = []
- cost, model = _process_single_file_logic(
+ cost, model, chain = _process_single_file_logic(
file_path,
rel_path,
existing_data,
@@ -277,16 +286,20 @@ def _threaded_process(i: int, file_path: str) -> Tuple[int, float, str]:
verbose,
local_results
)
- return i, cost, model, local_results[0] if local_results else None
+ return i, cost, model, chain, local_results[0] if local_results else None
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {
executor.submit(_threaded_process, i, fp): i
for i, fp in enumerate(files)
}
+ # Order chains by submission index so the propagated chain matches
+ # file order, not completion order (as_completed returns futures
+ # in whatever order they finish).
+ chains_by_index: Dict[int, List[str]] = {}
completed = 0
for future in as_completed(futures):
- idx, cost, model, result_row = future.result()
+ idx, cost, model, chain, result_row = future.result()
with cost_lock:
total_cost += cost
if model != "cached":
@@ -295,13 +308,16 @@ def _threaded_process(i: int, file_path: str) -> Tuple[int, float, str]:
results_data.append(result_row)
if csv_path:
_flush_csv_to_disk(results_data, csv_path)
+ chains_by_index[idx] = chain
completed += 1
if progress_callback:
progress_callback(completed, total_files)
+ for i in sorted(chains_by_index):
+ threaded_aggregated_chain.extend(chains_by_index[i])
elif progress_callback:
for i, file_path in enumerate(files):
progress_callback(i + 1, total_files)
- cost, model = _process_file(i, file_path)
+ cost, model, _chain = _process_file(i, file_path)
total_cost += cost
if model != "cached":
last_model_name = model
@@ -318,7 +334,7 @@ def _threaded_process(i: int, file_path: str) -> Tuple[int, float, str]:
) as progress:
task = progress.add_task("[cyan]Processing files...", total=len(files))
for i, file_path in enumerate(files):
- cost, model = _process_file(i, file_path)
+ cost, model, _chain = _process_file(i, file_path)
total_cost += cost
if model != "cached":
last_model_name = model
@@ -327,13 +343,19 @@ def _threaded_process(i: int, file_path: str) -> Tuple[int, float, str]:
progress.advance(task)
else:
for i, file_path in enumerate(files):
- cost, model = _process_file(i, file_path)
+ cost, model, _chain = _process_file(i, file_path)
total_cost += cost
if model != "cached":
last_model_name = model
if csv_path:
_flush_csv_to_disk(results_data, csv_path)
+ # Best-effort propagation to the main thread's Click context for the
+ # THREADED path only. The helper is a no-op when there's no active
+ # Click context.
+ if max_workers > 1 and threaded_aggregated_chain:
+ _propagate_attempted_models_to_ctx(threaded_aggregated_chain)
+
# Step 7: Merge in existing entries that were not part of this scan.
# This preserves cached summaries for files outside the current
# directory_path / glob scope so they are not silently dropped.
@@ -374,13 +396,22 @@ def _process_single_file_logic(
time: float,
verbose: bool,
results_data: List[Dict[str, str]]
-) -> Tuple[float, str]:
+) -> Tuple[float, str, List[str]]:
"""
Helper function to process a single file: read, hash, check cache, summarize if needed.
- Returns (cost, model_name).
+ Returns (cost, model_name, attempted_models).
+
+ The third element is the attempted-model fallback chain reported by
+ ``llm_invoke`` for this file's summarization call (empty list on cache
+ hits). Callers running in worker threads must aggregate these chains
+ and propagate to the main-thread Click context via
+ ``_propagate_attempted_models_to_ctx`` so ``@track_cost`` records the
+ full chain for the parent command — worker threads' ``click.get_current_context``
+ cannot see the parent ctx.
"""
cost = 0.0
model_name = "cached"
+ attempted_models: List[str] = []
try:
# Step 6a: Read file
@@ -449,6 +480,9 @@ def _process_single_file_logic(
cost = llm_result.get('cost', 0.0)
model_name = llm_result.get('model_name', "unknown")
+ raw_chain = llm_result.get('attempted_models', []) or []
+ if isinstance(raw_chain, list):
+ attempted_models = [str(m) for m in raw_chain if m]
# Print verbose per-file summary (only for freshly summarized files)
if verbose and not cache_hit:
@@ -467,6 +501,20 @@ def _process_single_file_logic(
})
except Exception as e:
+ # Harvest llm_invoke's exception-attached chain (set by the round-2
+ # pre-loop wrapper and the all-candidates RuntimeError). Worker
+ # threads in summarize_directory's parallel path can't reach the
+ # parent Click ctx, so the only way to recover the attempted chain
+ # from a failed file is via this exception attribute. Use the same
+ # dedup pattern as llm_invoke's chain-building loop (skip only when
+ # the new entry equals the last appended entry; do NOT pre-dedupe
+ # across the boundary — that was the round-5 anti-pattern).
+ failed_chain = getattr(e, 'attempted_models', None) or []
+ if isinstance(failed_chain, list):
+ for entry in failed_chain:
+ text = str(entry)
+ if text and (not attempted_models or attempted_models[-1] != text):
+ attempted_models.append(text)
console.print(f"[bold red]Error processing file {file_path}:[/bold red] {e}")
results_data.append({
'full_path': rel_path,
@@ -475,5 +523,5 @@ def _process_single_file_logic(
'dependencies': "[]",
'content_hash': "error"
})
-
- return cost, model_name
+
+ return cost, model_name, attempted_models
diff --git a/pdd/track_cost.py b/pdd/track_cost.py
index 2818e22de..5b68af9fd 100644
--- a/pdd/track_cost.py
+++ b/pdd/track_cost.py
@@ -1,18 +1,249 @@
import functools
+import time
+import uuid
from datetime import datetime
import csv
import os
+import tempfile
import click
from rich import print as rprint
-from typing import Any, Tuple
+from typing import Any, Optional, Tuple, List
+
+# Cross-platform exclusive lock for serializing CSV migration + append between
+# concurrent PDD processes. Uses O_CREAT | O_EXCL semantics which works on
+# POSIX and Windows alike, so Windows no longer falls through to unlocked
+# writes (the previous fcntl-only implementation lost rows during concurrent
+# legacy-CSV migration on Windows).
+#
+# Safety properties beyond plain O_EXCL:
+# - Each acquire writes a unique pid-uuid NONCE to the lock file. Release
+# reads the file back and only unlinks if the nonce still matches ours.
+# This prevents the release-after-stale-steal race: if process A's lock
+# is stolen by B, A's eventual release no longer deletes B's lock file.
+# - Staleness is determined by PID liveness (os.kill(pid, 0)), NOT by lock
+# file mtime alone. A legitimate slow migration whose holder is still
+# running will not be stolen even after many minutes. mtime is used only
+# as a long-threshold safety net for malformed lock files or Windows
+# filesystems where the PID alive-check is unreliable.
+_LOCK_RETRY_MAX = 300 # 300 * 0.1s = 30s max wait under contention
+_LOCK_RETRY_INTERVAL = 0.1
+_LOCK_STALE_SECONDS = 600 # mtime safety-net for unparseable nonces only;
+ # the primary staleness signal is PID liveness.
+
+
+def _is_pid_alive(pid: int) -> bool:
+ """Return True when the OS still has a process for ``pid``.
+
+ On POSIX, ``os.kill(pid, 0)`` raises ``ProcessLookupError`` when the
+ process is gone and ``PermissionError`` when it exists but we may not
+ signal it. On Windows (where ``signal 0`` semantics differ) and any
+ other OSError, return ``True`` conservatively so the lock holder gets
+ the benefit of the doubt — the mtime safety-net handles truly stuck
+ locks via ``_LOCK_STALE_SECONDS``.
+ """
+ if pid <= 0:
+ return False
+ try:
+ os.kill(pid, 0)
+ return True
+ except ProcessLookupError:
+ return False
+ except PermissionError:
+ # Process exists but we can't signal it — still alive.
+ return True
+ except OSError:
+ # Unsupported on this platform; fall back to mtime check.
+ return True
+
+
+def _read_lock_owner(lock_path: str) -> Optional[Tuple[int, str]]:
+ """Read ``(pid, nonce)`` from an existing lock file.
+
+ Returns ``None`` when the file is missing, unreadable, or malformed.
+ Used by :func:`_acquire_atomic_lock` to decide whether the holder is
+ still alive, and by :func:`_release_atomic_lock` to verify ownership.
+ """
+ try:
+ with open(lock_path, 'r', encoding='utf-8') as f:
+ content = f.read(128).strip()
+ except OSError:
+ return None
+ if not content:
+ return None
+ # Format: "-"
+ head, _, tail = content.partition('-')
+ if not head or not tail:
+ return None
+ try:
+ pid = int(head)
+ except ValueError:
+ return None
+ return pid, content
+
+
+def _is_stale_lock(lock_path: str) -> bool:
+ """Return True when the lock file should be reclaimed.
+
+ A lock is stale when (a) the PID it records is no longer alive, or
+ (b) the lock file is older than ``_LOCK_STALE_SECONDS`` AND has no
+ parseable PID (malformed). A LIVE PID is never stolen — even after
+ many minutes — so a legitimate slow migration is safe.
+ """
+ owner = _read_lock_owner(lock_path)
+ if owner is not None:
+ pid, _ = owner
+ return not _is_pid_alive(pid)
+ # Malformed or unreadable lock — fall back to mtime threshold so a
+ # truly stuck lock file can still be recovered eventually.
+ try:
+ age = time.time() - os.path.getmtime(lock_path)
+ return age > _LOCK_STALE_SECONDS
+ except OSError:
+ return False
+
+
+def _acquire_atomic_lock(
+ lock_path: str,
+) -> Tuple[Optional[Tuple[int, str]], bool]:
+ """Cross-platform best-effort exclusive lock via O_CREAT|O_EXCL.
+
+ Returns ``(handle, contended)`` where:
+ - ``handle`` is ``(fd, nonce)`` on success, ``None`` on failure;
+ - ``contended`` is ``True`` only when failure was due to another
+ live holder timing out our retry budget (so the caller can warn
+ the user about real contention vs. silent filesystem fallback).
+
+ Staleness is determined by PID liveness; mtime is only consulted as a
+ safety-net when the lock file is malformed.
+ """
+ nonce = f"{os.getpid()}-{uuid.uuid4().hex}"
+ nonce_bytes = nonce.encode()
+ saw_contention = False
+ for _ in range(_LOCK_RETRY_MAX):
+ try:
+ fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
+ except FileExistsError:
+ saw_contention = True
+ if _is_stale_lock(lock_path):
+ try:
+ os.unlink(lock_path)
+ except OSError:
+ pass
+ continue
+ time.sleep(_LOCK_RETRY_INTERVAL)
+ continue
+ except OSError:
+ # Filesystem doesn't support O_EXCL semantics; degrade
+ # silently (no contention was observed, just unsupported).
+ return None, False
+
+ # Now we hold the fd. Nonce write MUST succeed for ownership to be
+ # verifiable on release and staleness via PID-alive on contention.
+ # If it fails (disk full, fd revoked, process signaled mid-write),
+ # close + unlink so we don't leak a malformed lock file other
+ # processes will wait the mtime threshold on.
+ try:
+ written = os.write(fd, nonce_bytes)
+ if written != len(nonce_bytes):
+ raise OSError("partial nonce write")
+ except OSError:
+ try:
+ os.close(fd)
+ except OSError:
+ pass
+ try:
+ os.unlink(lock_path)
+ except OSError:
+ pass
+ time.sleep(_LOCK_RETRY_INTERVAL)
+ continue
+ return (fd, nonce), False
+ return None, saw_contention
+
+
+def _release_atomic_lock(handle: Optional[Tuple[int, str]], lock_path: str) -> None:
+ """Release the lock acquired by :func:`_acquire_atomic_lock`.
+
+ Resolution rules — strongest-evidence-first:
+ - If the lock file is gone, no-op (already cleaned).
+ - If the file's nonce matches ours, unlink (normal release).
+ - If the file's nonce DOES NOT match ours, leave alone (our lock
+ was stolen during execution; the new owner now holds the path).
+ - If the file exists but is unreadable / has no parseable nonce,
+ unlink anyway: we acquired the path via O_EXCL and the malformed
+ content can only be our own write that didn't complete (the
+ acquire path now rejects partial nonce writes, so in practice
+ this only fires when the lock file was truncated by something
+ external — leaving it would block future PDD invocations).
+
+ Best-effort: errors during close/read/unlink are swallowed so cost
+ tracking never breaks the wrapped command.
+ """
+ if handle is None:
+ return
+ fd, our_nonce = handle
+ try:
+ os.close(fd)
+ except OSError:
+ pass
+ if not os.path.exists(lock_path):
+ return
+ owner = _read_lock_owner(lock_path)
+ if owner is None:
+ # File exists but content is unparseable; we hold the
+ # O_EXCL-acquired fd, so clean up to avoid leaking the lock.
+ try:
+ os.unlink(lock_path)
+ except OSError:
+ pass
+ return
+ _, recorded = owner
+ if recorded == our_nonce:
+ try:
+ os.unlink(lock_path)
+ except OSError:
+ pass
+
+
+__all__ = ['track_cost', 'extract_cost_and_model', 'collect_files', 'wrapper', 'looks_like_file']
+
+def wrapper(*args, **kwargs):
+ """Dummy wrapper function to satisfy architecture export requirements."""
+ pass
+
+def looks_like_file(path_str):
+ """Check if string looks like a file path."""
+ if not path_str or not isinstance(path_str, str):
+ return False
+ # Has file extension or exists
+ return '.' in os.path.basename(path_str) or os.path.isfile(path_str)
def track_cost(func):
@functools.wraps(func)
- def wrapper(*args, **kwargs):
+ def _wrapper(*args, **kwargs):
ctx = click.get_current_context()
if ctx is None:
return func(*args, **kwargs)
+ # Scope attempted_models to a single @track_cost invocation so a
+ # chained / batched CLI run never inherits the fallback chain from a
+ # previous command on the same shared ctx.obj. Snapshot any prior
+ # value, reset it, let llm_invoke aggregate within this command,
+ # then restore the snapshot after the row is written.
+ attempted_models_scoped = False
+ prior_attempted_models_present = False
+ prior_attempted_models = None
+ if ctx.obj is not None and hasattr(ctx.obj, '__setitem__'):
+ try:
+ existing = ctx.obj.get('attempted_models') if hasattr(ctx.obj, 'get') else None
+ if existing is not None:
+ prior_attempted_models = existing
+ prior_attempted_models_present = True
+ ctx.obj['attempted_models'] = []
+ attempted_models_scoped = True
+ except Exception:
+ attempted_models_scoped = False
+
start_time = datetime.now()
result = None
exception_raised = None
@@ -42,6 +273,43 @@ def wrapper(*args, **kwargs):
finally:
end_time = datetime.now()
+ # Capture the chain THIS command accumulated before we restore the
+ # prior shared value. Without this snapshot, a later @track_cost
+ # invocation on the same ctx.obj would see leftover state from a
+ # prior command (issue #1086 regression).
+ #
+ # Capture regardless of whether the snapshot/restore guard at
+ # command entry fired (attempted_models_scoped). ctx.obj may have
+ # been None at entry and created inside func via
+ # ctx.ensure_object(dict), so a missing snapshot is NOT evidence
+ # that no chain was recorded — read ctx.obj['attempted_models']
+ # whenever ctx.obj is now non-None.
+ command_attempted_models: List[str] = []
+ try:
+ if ctx is not None and ctx.obj is not None and hasattr(ctx.obj, 'get'):
+ current = ctx.obj.get('attempted_models')
+ if isinstance(current, list):
+ command_attempted_models = [str(m) for m in current if m]
+ except Exception:
+ command_attempted_models = []
+
+ # Restore the prior shared value (or remove the key entirely) so
+ # the next command starts with a clean slate.
+ if attempted_models_scoped and ctx.obj is not None:
+ try:
+ if prior_attempted_models_present:
+ ctx.obj['attempted_models'] = prior_attempted_models
+ else:
+ if hasattr(ctx.obj, 'pop'):
+ ctx.obj.pop('attempted_models', None)
+ else:
+ try:
+ del ctx.obj['attempted_models']
+ except Exception:
+ pass
+ except Exception:
+ pass
+
try:
# Always collect files for core dump, even if output_cost is not set
input_files, output_files = collect_files(args, kwargs)
@@ -59,16 +327,50 @@ def wrapper(*args, **kwargs):
files_set.add(abs_path)
ctx.obj['core_dump_files'] = files_set
- # Check if we need to write cost tracking (only on success)
- if exception_raised is None:
+ # Decide whether to write a cost-tracking row. On success we
+ # always write. On failure we still write a row when there's
+ # any recorded fallback history to surface (either on the
+ # raised exception or on ctx.obj) so users can see which
+ # candidate models were attempted before the command failed.
+ exception_attempted: List[str] = []
+ if exception_raised is not None:
+ raw = getattr(exception_raised, 'attempted_models', None)
+ if isinstance(raw, list):
+ exception_attempted = [str(m) for m in raw if m]
+
+ should_write = (
+ exception_raised is None
+ or bool(exception_attempted)
+ or bool(command_attempted_models)
+ )
+
+ if should_write:
if ctx.obj and hasattr(ctx.obj, 'get'):
output_cost_path = ctx.obj.get('output_cost') or os.getenv('PDD_OUTPUT_COST_PATH')
else:
output_cost_path = os.getenv('PDD_OUTPUT_COST_PATH')
if output_cost_path and os.environ.get('PYTEST_CURRENT_TEST') is None:
- command_name = ctx.command.name
- cost, model_name = extract_cost_and_model(result)
+ command_name = ctx.command.name if ctx.command else "unknown"
+ if exception_raised is None:
+ cost, model_name, result_attempted = extract_cost_and_model(result)
+ else:
+ # On total failure there is no successful result
+ # to mine, but ctx/exception may still carry the
+ # attempted chain.
+ cost, model_name, result_attempted = '', '', exception_attempted
+
+ # Determine attempted_models chain. Use only the chain
+ # accumulated within THIS @track_cost invocation so
+ # cost.csv rows never inherit a previous command's
+ # fallback history (issue #1086 regression).
+ attempted_models_list = []
+ if command_attempted_models:
+ attempted_models_list = command_attempted_models
+ elif result_attempted:
+ attempted_models_list = result_attempted
+
+ attempted_models_str = ';'.join(attempted_models_list) if attempted_models_list else ''
timestamp = start_time.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3]
@@ -79,16 +381,155 @@ def wrapper(*args, **kwargs):
'cost': cost,
'input_files': ';'.join(input_files),
'output_files': ';'.join(output_files),
+ 'attempted_models': attempted_models_str,
}
+ fieldnames = ['timestamp', 'model', 'command', 'cost', 'input_files', 'output_files', 'attempted_models']
file_has_content = os.path.isfile(output_cost_path) and os.path.getsize(output_cost_path) > 0
- fieldnames = ['timestamp', 'model', 'command', 'cost', 'input_files', 'output_files']
- with open(output_cost_path, 'a', newline='', encoding='utf-8') as csvfile:
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
- if not file_has_content:
- writer.writeheader()
- writer.writerow(row)
+ # Cheap pre-lock peek: does the file need migration?
+ # Migration is the only operation that requires
+ # cross-process serialization (it's a read-modify-
+ # write of the whole file). A pure append to an
+ # already-migrated file is safe without a lock on
+ # POSIX because O_APPEND guarantees atomicity for
+ # writes up to PIPE_BUF (≥4096 bytes; our rows are
+ # well under that). This split means a stuck or
+ # filesystem-unsupported lock no longer reopens the
+ # lost-row race on the normal append path.
+ needs_migration_likely = False
+ peek_failed = False
+ if file_has_content:
+ try:
+ with open(output_cost_path, 'r', newline='', encoding='utf-8') as _peek:
+ _peek_header = next(csv.reader(_peek), None)
+ if _peek_header is not None and 'attempted_models' not in _peek_header:
+ needs_migration_likely = True
+ except (OSError, csv.Error, UnicodeDecodeError, StopIteration):
+ # Pre-lock peek can fail when the existing
+ # CSV is malformed (csv.Error), not valid
+ # UTF-8 (UnicodeDecodeError), or unreadable
+ # (OSError). Don't abort cost logging on
+ # such a file — fall through to the append
+ # path so the new row still lands. The
+ # csv.writer can append regardless of the
+ # existing content.
+ peek_failed = True
+
+ # Lock acquisition policy:
+ # - Migration needed AND lock held → migrate +
+ # append under lock (the read-modify-write
+ # race needs serialization).
+ # - Migration needed AND lock contended (real
+ # concurrent holder) → SKIP the cost row with
+ # a loud warning; better than racing.
+ # - Migration needed AND lock unsupported by
+ # the filesystem → attempt UNLOCKED migration
+ # with a one-line warning. Filesystems that
+ # reject O_EXCL are typically single-process
+ # environments (e.g. some network mounts);
+ # refusing forever would be worse than the
+ # theoretical multi-process race.
+ # - Pure append → acquire best-effort; on any
+ # failure, fall through to unlocked append
+ # (relying on POSIX O_APPEND atomicity).
+ lock_path = output_cost_path + ".lock"
+ lock_handle = None
+ lock_contended = False
+ if needs_migration_likely:
+ lock_handle, lock_contended = _acquire_atomic_lock(lock_path)
+ if lock_handle is None:
+ if lock_contended:
+ rprint(
+ "[yellow]track_cost: could not acquire "
+ "CSV lock for legacy migration after 30s "
+ "(another PDD process is still holding it); "
+ "skipping this cost row to avoid corrupting "
+ f"{output_cost_path}.[/yellow]"
+ )
+ # Skip the write entirely under real contention.
+ if exception_raised is not None:
+ raise exception_raised
+ return result
+ else:
+ # Unsupported filesystem: a single-process
+ # unlocked migration is the lesser evil.
+ # Document the tradeoff in the warning.
+ rprint(
+ "[yellow]track_cost: lock not supported on "
+ f"this filesystem; migrating {output_cost_path} "
+ "without serialization (safe for single-process "
+ "use; concurrent PDD writers on this filesystem "
+ "can lose rows during legacy migration).[/yellow]"
+ )
+ else:
+ lock_handle, lock_contended = _acquire_atomic_lock(lock_path)
+ # On pure-append, lock failure is acceptable —
+ # POSIX O_APPEND atomicity handles small rows.
+
+ try:
+ # If file exists with an older header that
+ # doesn't include 'attempted_models', migrate it
+ # in place so DictReader can expose the new
+ # column properly. Peek only the header on
+ # every call; load and rewrite rows ONLY when
+ # migration is actually needed, to keep append
+ # cost O(1) over the lifetime of a long-lived
+ # cost CSV. RE-READ under the lock — another
+ # process may have migrated between our
+ # pre-lock check and lock acquisition.
+ # Recompute file_has_content too, in case another
+ # process just created/extended the file.
+ file_has_content = os.path.isfile(output_cost_path) and os.path.getsize(output_cost_path) > 0
+ if file_has_content:
+ existing_header = None
+ existing_rows = []
+ needs_migration = False
+ try:
+ with open(output_cost_path, 'r', newline='', encoding='utf-8') as existing:
+ reader = csv.reader(existing)
+ existing_header = next(reader, None)
+ if existing_header is not None and 'attempted_models' not in existing_header:
+ needs_migration = True
+ existing_rows = list(reader)
+ except Exception:
+ existing_header = None
+ existing_rows = []
+ needs_migration = False
+
+ if needs_migration:
+ migrated_fieldnames = list(existing_header) + ['attempted_models']
+ # Write the migrated CSV to a tempfile
+ # in the SAME directory as the original,
+ # then os.replace it atomically.
+ parent_dir = os.path.dirname(os.path.abspath(output_cost_path)) or "."
+ tmp_fd, tmp_path = tempfile.mkstemp(
+ prefix=".cost.csv.migrate-", dir=parent_dir
+ )
+ try:
+ with os.fdopen(tmp_fd, 'w', newline='', encoding='utf-8') as tmp_file:
+ migrate_writer = csv.writer(tmp_file)
+ migrate_writer.writerow(migrated_fieldnames)
+ for existing_row in existing_rows:
+ # Pad shorter rows so column count matches.
+ padded = list(existing_row) + [''] * (len(migrated_fieldnames) - len(existing_row))
+ migrate_writer.writerow(padded)
+ os.replace(tmp_path, output_cost_path)
+ except Exception:
+ try:
+ os.unlink(tmp_path)
+ except OSError:
+ pass
+ raise
+ fieldnames = migrated_fieldnames
+
+ with open(output_cost_path, 'a', newline='', encoding='utf-8') as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
+ if not file_has_content:
+ writer.writeheader()
+ writer.writerow(row)
+ finally:
+ _release_atomic_lock(lock_handle, lock_path)
except Exception as e:
rprint(f"[red]Error tracking cost: {e}[/red]")
@@ -99,12 +540,16 @@ def wrapper(*args, **kwargs):
return result
- return wrapper
+ return _wrapper
-def extract_cost_and_model(result: Any) -> Tuple[Any, str]:
- if isinstance(result, tuple) and len(result) >= 3:
- return result[-2], result[-1]
- return '', ''
+def extract_cost_and_model(result: Any) -> Tuple[Any, str, List[str]]:
+ if isinstance(result, tuple) and len(result) >= 1:
+ last_elem = result[-1]
+ if isinstance(last_elem, dict) and 'cost' in last_elem and 'model_name' in last_elem:
+ return last_elem.get('cost', 0), last_elem.get('model_name', ''), last_elem.get('attempted_models', [])
+ elif len(result) >= 3:
+ return result[-2], result[-1], []
+ return '', '', []
def collect_files(args, kwargs):
input_files = []
@@ -123,14 +568,6 @@ def collect_files(args, kwargs):
'output_test', 'output_code', 'output_results'
}
- # Helper to check if something looks like a file path
- def looks_like_file(path_str):
- """Check if string looks like a file path."""
- if not path_str or not isinstance(path_str, str):
- return False
- # Has file extension or exists
- return '.' in os.path.basename(path_str) or os.path.isfile(path_str)
-
# Collect from kwargs (most reliable since Click uses named parameters)
for k, v in kwargs.items():
if k in ('ctx', 'context', 'output_cost'):
@@ -177,4 +614,3 @@ def looks_like_file(path_str):
input_files.append(item)
return input_files, output_files
-
diff --git a/tests/test_auto_deps_main.py b/tests/test_auto_deps_main.py
index 2c1a58e19..52a320715 100644
--- a/tests/test_auto_deps_main.py
+++ b/tests/test_auto_deps_main.py
@@ -1236,3 +1236,55 @@ def test_auto_deps_finalization_captures_attributed_include_deps(
assert len(include_deps["parent.py"]) == 64, (
"include_deps hash should be a SHA-256 hex digest"
)
+
+
+@patch("pdd.auto_deps_main.construct_paths")
+@patch("pdd.auto_deps_main.insert_includes")
+def test_auto_deps_propagates_attempted_models_to_ctx(
+ mock_insert_includes,
+ mock_construct_paths,
+ mock_ctx,
+ tmp_dir,
+):
+ """Regression (PR #1087): auto_deps_main delegates the LLM call to
+ insert_includes, which calls llm_invoke. llm_invoke already
+ best-effort propagates the attempted_models chain to
+ ctx.obj['attempted_models'] on success (per the round-4 'at most
+ once' contract), so when the @track_cost decorator on the
+ auto-deps command reads ctx.obj['attempted_models'] at finally-time
+ it sees the chain. Simulate insert_includes calling
+ _propagate_attempted_models_to_ctx as llm_invoke would, and assert
+ the chain lands on ctx.
+ """
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ output_path = os.path.join(tmp_dir, "output.prompt")
+ csv_path = os.path.join(tmp_dir, "project_dependencies.csv")
+ mock_construct_paths.return_value = _make_construct_paths_return(
+ output_path, csv_path
+ )
+
+ def _insert_includes_side_effect(*args, **kwargs):
+ # llm_invoke (called from inside insert_includes) propagates the
+ # chain to the current Click context. Mirror that behavior here so
+ # the test exercises the real propagation path that production
+ # code follows.
+ _propagate_attempted_models_to_ctx(['gpt-4', 'gpt-4o'])
+ return _make_insert_includes_return()
+
+ mock_insert_includes.side_effect = _insert_includes_side_effect
+
+ with mock_ctx:
+ auto_deps_main(
+ ctx=mock_ctx,
+ prompt_file="sample_prompt_python.prompt",
+ directory_path="context/",
+ auto_deps_csv_path=None,
+ output=None,
+ force_scan=False,
+ )
+
+ assert mock_ctx.obj.get('attempted_models') == ['gpt-4', 'gpt-4o'], (
+ f"Expected ctx.obj['attempted_models']=['gpt-4','gpt-4o'], "
+ f"got {mock_ctx.obj.get('attempted_models')!r}"
+ )
diff --git a/tests/test_fix_error_loop.py b/tests/test_fix_error_loop.py
index d5c94e50d..d2650dc19 100644
--- a/tests/test_fix_error_loop.py
+++ b/tests/test_fix_error_loop.py
@@ -790,10 +790,17 @@ def capture_cwd_mock(**kwargs):
with patch("pdd.fix_error_loop.run_agentic_fix", side_effect=capture_cwd_mock) as mock_agent, \
patch("pdd.fix_error_loop.run_pytest_on_file") as mock_pytest, \
+ patch("pdd.fix_error_loop.fix_errors_from_unit_tests") as mock_fix, \
patch("subprocess.run") as mock_subprocess:
# Make tests have failures to trigger agentic fallback
# since success = (fails == 0 and errors == 0) will be False
- mock_pytest.return_value = (1, 0, 0, "1 failed")
+ mock_pytest.side_effect = [
+ (1, 0, 0, "1 failed"),
+ (1, 0, 0, "still failed"),
+ ]
+ # Keep this unit test isolated from real LLM/cloud auth. The behavior
+ # under test is only the cwd passed to the final agentic fallback.
+ mock_fix.return_value = (False, False, "", "", "mock analysis", 0.0, "mock-model")
# Mock subprocess for verification program
mock_subprocess.return_value = subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="")
@@ -1244,10 +1251,174 @@ def test_cloud_fix_errors_success(mock_cloud_config, mock_requests):
result = cloud_fix_errors(
"test", "code", "prompt", "error", "err_file", 0.5, 0.1
)
-
- assert result == (True, True, "new_test", "new_code", "fixed it", 0.05, "cloud-gpt")
+
+ # Non-conforming server (no attemptedModels): degraded fallback chain
+ # is the single-entry [cloud:].
+ assert result == (
+ True, True, "new_test", "new_code", "fixed it", 0.05, "cloud-gpt",
+ ["cloud:cloud-gpt"],
+ )
mock_requests.post.assert_called_once()
+def test_cloud_fix_errors_parses_attempted_models(mock_cloud_config, mock_requests):
+ """Regression (PR #1087, issue #1086): the cloud /fixCode response
+ includes the full chronological cloud-side model chain in
+ `attemptedModels`. cloud_fix_errors MUST parse it and return it as
+ the 8th tuple slot, with each entry prefixed by `cloud:`. The
+ aliases `attempted_models` and `modelChain` are also accepted for
+ backward compatibility.
+ """
+ mock_cloud_config.get_jwt_token.return_value = "fake_token"
+ mock_cloud_config.get_endpoint_url.return_value = "http://api/fix"
+
+ # Canonical camelCase key.
+ mock_response = MagicMock()
+ mock_response.json.return_value = {
+ "updateUnitTest": True,
+ "updateCode": True,
+ "fixedUnitTest": "new_test",
+ "fixedCode": "new_code",
+ "analysis": "fixed it",
+ "totalCost": 0.05,
+ "modelName": "gemini-2.5-pro",
+ "attemptedModels": ["gemini-2.5-flash", "gemini-2.5-pro"],
+ }
+ mock_requests.post.return_value = mock_response
+
+ result = cloud_fix_errors("t", "c", "p", "e", "f", 0.5, 0.1)
+
+ assert result[7] == ["cloud:gemini-2.5-flash", "cloud:gemini-2.5-pro"], (
+ f"Expected prefixed cloud chain in slot 7, got {result[7]!r}"
+ )
+
+ # snake_case alias.
+ mock_response.json.return_value = {
+ "updateUnitTest": True,
+ "updateCode": True,
+ "fixedUnitTest": "new_test",
+ "fixedCode": "new_code",
+ "analysis": "fixed it",
+ "totalCost": 0.05,
+ "modelName": "gpt-4o",
+ "attempted_models": ["gpt-4", "gpt-4o"],
+ }
+ result = cloud_fix_errors("t", "c", "p", "e", "f", 0.5, 0.1)
+ assert result[7] == ["cloud:gpt-4", "cloud:gpt-4o"]
+
+ # Legacy modelChain alias.
+ mock_response.json.return_value = {
+ "updateUnitTest": True,
+ "updateCode": True,
+ "fixedUnitTest": "new_test",
+ "fixedCode": "new_code",
+ "analysis": "fixed it",
+ "totalCost": 0.05,
+ "modelName": "claude-sonnet",
+ "modelChain": ["claude-opus", "claude-sonnet"],
+ }
+ result = cloud_fix_errors("t", "c", "p", "e", "f", 0.5, 0.1)
+ assert result[7] == ["cloud:claude-opus", "cloud:claude-sonnet"]
+
+
+def test_cloud_fix_errors_timeout_attaches_attempted_models(
+ mock_cloud_config, mock_requests
+):
+ """Regression (codex round-9, P2): when cloud_fix_errors raises after
+ the HTTP request was attempted (e.g. timeout), the RuntimeError MUST
+ carry ``attempted_models=['cloud:auto']`` so the caller in
+ fix_error_loop can recover the cloud attempt and propagate it to ctx
+ for @track_cost (issue #1086).
+ """
+ import requests as _requests
+
+ mock_cloud_config.get_jwt_token.return_value = "fake_token"
+ mock_cloud_config.get_endpoint_url.return_value = "http://api/fix"
+ mock_requests.exceptions.Timeout = _requests.exceptions.Timeout
+ mock_requests.post.side_effect = _requests.exceptions.Timeout("read timed out")
+
+ with pytest.raises(RuntimeError) as exc_info:
+ cloud_fix_errors("t", "c", "p", "e", "f", 0.5, 0.1)
+
+ assert getattr(exc_info.value, "attempted_models", None) == ["cloud:auto"], (
+ f"Expected cloud:auto sentinel on timeout RuntimeError, got "
+ f"{getattr(exc_info.value, 'attempted_models', None)!r}"
+ )
+
+
+def test_fix_error_loop_propagates_failed_cloud_attempt_to_ctx(
+ mock_console, mock_files
+):
+ """Regression (codex round-9, P2): when cloud_fix_errors raises and
+ fix_error_loop falls back to local, the cloud attempt
+ (``e.attempted_models``) MUST be propagated to ctx BEFORE the local
+ fallback runs. Otherwise cost.csv records only the local model and
+ loses the cloud attempt that actually happened.
+ """
+ from unittest.mock import patch as _patch
+ code, test, prompt = mock_files
+
+ cloud_err = RuntimeError("simulated cloud timeout")
+ cloud_err.attempted_models = ["cloud:auto"]
+
+ with _patch("pdd.fix_error_loop.cloud_fix_errors", side_effect=cloud_err), \
+ _patch("pdd.fix_error_loop.fix_errors_from_unit_tests") as mock_local, \
+ _patch("pdd.fix_error_loop.run_pytest_on_file") as mock_pytest, \
+ _patch("pdd.fix_error_loop.subprocess") as mock_subprocess, \
+ _patch("pdd.fix_error_loop._propagate_attempted_models_to_ctx") as mock_propagate:
+ mock_pytest.side_effect = [(1, 0, 0, "Fail"), (0, 0, 0, "Pass")]
+ mock_local.return_value = (
+ True, True, "fixed_test", "fixed_code", "local fallback analysis",
+ 0.02, "local-model",
+ )
+ mock_subprocess.run.return_value.returncode = 0
+
+ fix_error_loop(
+ test, code, prompt, "prompt", "verify.py", 0.5, 0.1, 5, 1.0,
+ use_cloud=True,
+ )
+
+ # The cloud failure path must have propagated the recovered chain.
+ propagated_chains = [c.args[0] for c in mock_propagate.call_args_list]
+ assert ["cloud:auto"] in propagated_chains, (
+ f"Expected ['cloud:auto'] in propagated chains; got {propagated_chains}"
+ )
+
+
+def test_cloud_fix_errors_scalar_attempted_models_is_wrapped(
+ mock_cloud_config, mock_requests
+):
+ """Regression (3rd human review pass): a non-conforming cloud server
+ that returns ``attemptedModels`` as a scalar string (e.g. ``"gpt-4"``)
+ instead of a list must not be iterated character-by-character. The
+ chain must be wrapped to a single-element list before the
+ ``cloud:`` prefix comprehension, so the cost CSV records
+ ``['cloud:gpt-4']`` not ``['cloud:g','cloud:p','cloud:t','cloud:-','cloud:4']``.
+ """
+ mock_cloud_config.get_jwt_token.return_value = "fake_token"
+ mock_cloud_config.get_endpoint_url.return_value = "http://api/fix"
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.raise_for_status = MagicMock()
+ mock_response.json.return_value = {
+ "updateUnitTest": False,
+ "updateCode": True,
+ "fixedUnitTest": "",
+ "fixedCode": "fixed",
+ "analysis": "ok",
+ "totalCost": 0.01,
+ "modelName": "gpt-4",
+ "attemptedModels": "gpt-4", # scalar string — non-conforming server
+ }
+ mock_requests.post.return_value = mock_response
+
+ result = cloud_fix_errors("t", "c", "p", "e", "f", 0.5, 0.1)
+
+ assert result[7] == ["cloud:gpt-4"], (
+ f"Scalar attemptedModels must be wrapped to a single-element list; "
+ f"got {result[7]!r}"
+ )
+
+
def test_cloud_fix_errors_no_token(mock_cloud_config):
"""Test error when no JWT token is available."""
mock_cloud_config.get_jwt_token.return_value = None
@@ -1364,7 +1535,7 @@ def test_fix_error_loop_cloud_mode(mock_subprocess, mock_cloud, mock_pytest, moc
"""Test that use_cloud=True calls cloud_fix_errors."""
code, test, prompt = mock_files
mock_pytest.side_effect = [(1, 0, 0, "Fail"), (0, 0, 0, "Pass")]
- mock_cloud.return_value = (False, True, "", "cloud_code", "analysis", 0.2, "cloud-model")
+ mock_cloud.return_value = (False, True, "", "cloud_code", "analysis", 0.2, "cloud-model", ["cloud:cloud-model"])
mock_subprocess.return_value.returncode = 0
success, _, _, attempts, cost, _ = fix_error_loop(
diff --git a/tests/test_fix_main.py b/tests/test_fix_main.py
index 611898dd7..58cba4349 100644
--- a/tests/test_fix_main.py
+++ b/tests/test_fix_main.py
@@ -1828,7 +1828,16 @@ def test_cloud_fix_errors_success(mock_get_jwt, mock_post):
}
mock_post.return_value = mock_response
- update_ut, update_code, fixed_ut, fixed_code, analysis, cost, model = cloud_fix_errors(
+ (
+ update_ut,
+ update_code,
+ fixed_ut,
+ fixed_code,
+ analysis,
+ cost,
+ model,
+ attempted_models,
+ ) = cloud_fix_errors(
unit_test="original unit test",
code="original code",
prompt="test prompt",
@@ -1848,6 +1857,8 @@ def test_cloud_fix_errors_success(mock_get_jwt, mock_post):
assert analysis == "Analysis of fix"
assert cost == 0.05
assert model == "cloud-fix-model"
+ # Non-conforming server (no attemptedModels): degraded fallback chain.
+ assert attempted_models == ["cloud:cloud-fix-model"]
# Verify request was made correctly
mock_post.assert_called_once()
@@ -1979,7 +1990,8 @@ def test_fix_error_loop_uses_cloud_when_use_cloud_true(mock_pytest, mock_cloud_f
"fixed code",
"Cloud analysis",
0.05, # cost
- "cloud-model"
+ "cloud-model",
+ ["cloud:cloud-model"], # attempted_models (issue #1086)
)
success, final_ut, final_code, attempts, cost, model = fix_error_loop(
diff --git a/tests/test_llm_invoke.py b/tests/test_llm_invoke.py
index c193e82a8..84e1fe6b3 100644
--- a/tests/test_llm_invoke.py
+++ b/tests/test_llm_invoke.py
@@ -1562,6 +1562,35 @@ def test_llm_invoke_responses_api_valid_json_parses_correctly(mock_load_models,
assert response['result'].field2 == 42
+def test_llm_invoke_responses_api_success_returns_attempted_models(mock_load_models, mock_set_llm_cache):
+ """Successful Responses API calls should expose and propagate attempted_models."""
+ model_key_name = "OPENAI_API_KEY"
+ valid_json = '{"field1": "test_value", "field2": 42}'
+
+ mock_ctx = MagicMock()
+ mock_ctx.obj = {}
+
+ with patch.dict(os.environ, {model_key_name: "fake_key_value"}):
+ mock_responses_return = create_mock_openai_responses_api_response(valid_json)
+
+ with patch('pdd.llm_invoke.litellm.responses', return_value=mock_responses_return):
+ with patch('pdd.llm_invoke.litellm.completion') as mock_completion:
+ with patch('pdd.llm_invoke._LAST_CALLBACK_DATA', {"cost": 0.0001, "input_tokens": 10, "output_tokens": 10}):
+ with patch('click.get_current_context', return_value=mock_ctx):
+ response = llm_invoke(
+ prompt="Test prompt {text}",
+ input_json={"text": "test"},
+ strength=0.5,
+ temperature=0.0,
+ verbose=False,
+ output_pydantic=SampleOutputModel
+ )
+
+ assert response['attempted_models'] == ['gpt-5-nano']
+ assert mock_ctx.obj['attempted_models'] == ['gpt-5-nano']
+ mock_completion.assert_not_called()
+
+
# --- Tests for Multi-Credential Provider (Vertex AI) ---
def test_vertex_multi_credential_no_api_key_passed(mock_set_llm_cache):
@@ -2594,6 +2623,7 @@ def test_llm_invoke_cloud_fallback_on_error():
# Should have used local fallback
assert result["result"] == "local fallback response"
+ assert result["attempted_models"] == ["test-model"]
def test_llm_invoke_insufficient_credits_no_fallback():
@@ -2657,6 +2687,125 @@ def test_llm_invoke_cloud_invocation_error_fallback():
assert result["result"] == "local fallback response"
+def test_cloud_attempts_survive_pre_loop_setup_failure():
+ """Regression (codex P2): when a CloudFallbackError records cloud attempts
+ AND the subsequent local pre-candidate-loop setup raises (e.g. corrupt
+ llm_model.csv), the escaping exception MUST carry the cloud chain in
+ ``e.attempted_models`` AND the cloud chain MUST have already been
+ propagated to ``ctx.obj['attempted_models']`` so the ``@track_cost``
+ decorator can still write a cost row for the failed command.
+ """
+ from pdd.llm_invoke import CloudFallbackError as CurrentCloudFallbackError
+
+ cloud_err = CurrentCloudFallbackError("Network error")
+ cloud_err.attempted_models = ["cloud:gemini-2.5-pro"]
+
+ with patch("pdd.llm_invoke._llm_invoke_cloud", side_effect=cloud_err):
+ # Simulate a setup failure between the cloud handler and the
+ # candidate loop. _load_model_data sits in that gap.
+ with patch(
+ "pdd.llm_invoke._load_model_data",
+ side_effect=ValueError("simulated setup failure"),
+ ):
+ with patch(
+ "pdd.llm_invoke._propagate_attempted_models_to_ctx"
+ ) as mock_propagate:
+ with patch("rich.console.Console"):
+ with pytest.raises(ValueError) as exc_info:
+ llm_invoke(
+ prompt="Test {topic}",
+ input_json={"topic": "test"},
+ use_cloud=True,
+ )
+
+ # The escaping ValueError must carry the cloud chain so track_cost can
+ # record it on the failed command's CSV row.
+ assert getattr(exc_info.value, "attempted_models", None) == [
+ "cloud:gemini-2.5-pro"
+ ]
+
+ # The cloud chain must have been propagated to ctx BEFORE the pre-loop
+ # setup raised, so track_cost can recover it from ctx.obj even if the
+ # exception path were to lose the attribute.
+ propagated_chains = [c.args[0] for c in mock_propagate.call_args_list]
+ assert ["cloud:gemini-2.5-pro"] in propagated_chains, (
+ f"Expected ctx propagation of cloud chain, got: {propagated_chains}"
+ )
+
+
+def test_propagate_attempted_models_appends_repeated_chain():
+ """Regression (codex round 4, P3): two SIBLING llm_invoke calls in one
+ Click command that both produce the same multi-model chain
+ (e.g. both report ['cloud:a','cloud:b']) must compose into the
+ chronologically correct ['cloud:a','cloud:b','cloud:a','cloud:b'].
+
+ The helper is contracted to be called AT MOST ONCE per llm_invoke
+ invocation and now performs pure concatenation across boundaries
+ (round 5), so the cross-invocation history is preserved verbatim.
+ """
+ import click
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ ctx = click.Context(click.Command("x"), obj={})
+ with ctx:
+ _propagate_attempted_models_to_ctx(['cloud:a', 'cloud:b'])
+ _propagate_attempted_models_to_ctx(['cloud:a', 'cloud:b'])
+ assert ctx.obj['attempted_models'] == [
+ 'cloud:a', 'cloud:b', 'cloud:a', 'cloud:b'
+ ]
+
+
+def test_propagate_attempted_models_appends_when_incoming_is_disjoint():
+ """Pure concatenation must compose disjoint chains from multiple
+ distinct llm_invoke calls in one Click command into one chronological
+ list.
+ """
+ import click
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ ctx = click.Context(click.Command("x"), obj={})
+ with ctx:
+ _propagate_attempted_models_to_ctx(['m1'])
+ _propagate_attempted_models_to_ctx(['m2'])
+ assert ctx.obj['attempted_models'] == ['m1', 'm2']
+
+
+def test_propagate_attempted_models_preserves_same_model_repeats():
+ """Regression (codex round 5, P2): two sibling llm_invoke calls that
+ each report a single identical model (e.g. both ['gpt-4']) must
+ compose to ['gpt-4','gpt-4'] — collapsing to ['gpt-4'] at the
+ invocation boundary would silently drop a real LLM call from the
+ cost CSV.
+ """
+ import click
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ ctx = click.Context(click.Command("x"), obj={})
+ with ctx:
+ _propagate_attempted_models_to_ctx(['gpt-4'])
+ _propagate_attempted_models_to_ctx(['gpt-4'])
+ assert ctx.obj['attempted_models'] == ['gpt-4', 'gpt-4']
+
+
+def test_propagate_attempted_models_preserves_boundary_adjacency():
+ """Regression (codex round 5, P2): when the second incoming chain
+ starts with the same model the previous chain ended with (e.g. prior
+ ended at 'cloud:b', next starts with 'cloud:b' then escalates to
+ 'local:c'), the boundary adjacency must NOT be collapsed — both
+ 'cloud:b' entries are distinct LLM calls and must appear in order.
+ """
+ import click
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ ctx = click.Context(click.Command("x"), obj={})
+ with ctx:
+ _propagate_attempted_models_to_ctx(['cloud:a', 'cloud:b'])
+ _propagate_attempted_models_to_ctx(['cloud:b', 'local:c'])
+ assert ctx.obj['attempted_models'] == [
+ 'cloud:a', 'cloud:b', 'cloud:b', 'local:c'
+ ]
+
+
# --- Tests for cloud exception classes ---
def test_cloud_fallback_error():
@@ -2719,6 +2868,98 @@ def test_cloud_enabled_detection():
assert result["cost"] == 0.001
+def test_cloud_response_preserves_full_attempted_models_chain():
+ """Issue #1086: when the cloud response includes the full attempted-model
+ chain (camelCase, snake_case, or legacy ``modelChain``), the helper must
+ propagate every entry to callers so the cost CSV can record the abandoned
+ cloud-side attempts. Falling back to ``[modelName]`` would lose history.
+ """
+ with patch("pdd.core.cloud.CloudConfig") as mock_config:
+ mock_config.get_jwt_token.return_value = "fake_token"
+ mock_config.get_endpoint_url.return_value = "https://example.com/llmInvoke"
+
+ for response_key in ("attemptedModels", "attempted_models", "modelChain"):
+ with patch("requests.post") as mock_post:
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.json.return_value = {
+ "result": "cloud result",
+ "totalCost": 0.002,
+ "modelName": "final-model",
+ response_key: ["first-try", "second-try", "final-model"],
+ }
+ mock_post.return_value = mock_response
+
+ from pdd.llm_invoke import _llm_invoke_cloud
+
+ result = _llm_invoke_cloud(
+ prompt="Test",
+ input_json={},
+ strength=0.5,
+ temperature=0.1,
+ verbose=False,
+ output_pydantic=None,
+ output_schema=None,
+ time=0.25,
+ use_batch_mode=False,
+ messages=None,
+ language=None,
+ )
+
+ assert result["attempted_models"] == [
+ "first-try", "second-try", "final-model"
+ ], f"chain not preserved for response key {response_key!r}: {result['attempted_models']!r}"
+
+
+def test_cloud_response_missing_attempted_models_warns_and_degrades(caplog):
+ """Issue #1086: the cloud /llmInvoke success contract REQUIRES
+ `attemptedModels`. A non-conforming server that omits all three accepted
+ keys (`attemptedModels`, `attempted_models`, `modelChain`) MUST trigger a
+ client-side warning and degrade to a single-entry `[modelName]` chain
+ — preserving backward compatibility but flagging the loss of cloud-side
+ fallback history.
+ """
+ with patch("pdd.core.cloud.CloudConfig") as mock_config:
+ mock_config.get_jwt_token.return_value = "fake_token"
+ mock_config.get_endpoint_url.return_value = "https://example.com/llmInvoke"
+
+ with patch("requests.post") as mock_post:
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.json.return_value = {
+ "result": "cloud result",
+ "totalCost": 0.001,
+ "modelName": "only-final",
+ # Deliberately omits attemptedModels / attempted_models / modelChain
+ }
+ mock_post.return_value = mock_response
+
+ from pdd.llm_invoke import _llm_invoke_cloud
+
+ with caplog.at_level(logging.WARNING, logger="pdd.llm_invoke"):
+ result = _llm_invoke_cloud(
+ prompt="Test",
+ input_json={},
+ strength=0.5,
+ temperature=0.1,
+ verbose=False,
+ output_pydantic=None,
+ output_schema=None,
+ time=0.25,
+ use_batch_mode=False,
+ messages=None,
+ language=None,
+ )
+
+ # Backward-compat fallback: single-entry chain from modelName.
+ assert result["attempted_models"] == ["only-final"]
+ # And the client logs a warning so non-conforming servers are visible.
+ assert any(
+ "attemptedModels" in r.message and "issue #1086" in r.message
+ for r in caplog.records
+ ), f"Expected a warning about missing attemptedModels; got: {[r.message for r in caplog.records]}"
+
+
# --- Issue #348: Auth Status Mismatch Tests ---
def test_llm_invoke_cloud_401_clears_jwt_cache():
@@ -5942,6 +6183,65 @@ def test_within_limits_calls_completion(self, mock_load_models, mock_set_llm_cac
assert result["result"] == "hello"
mock_comp.assert_called_once()
+ def test_context_window_skipped_model_not_in_attempted_models(self, mock_load_models, mock_set_llm_cache):
+ """A candidate rejected by pre-call context-window validation must NOT appear in attempted_models.
+
+ The prompt contract (llm_invoke_python.prompt 'Attempted-Model Tracking') requires that
+ only models for which an actual provider call was attempted are recorded. Candidates
+ skipped before any call (missing api_key, pre-call context-window overflow) must not be
+ appended to the chain.
+
+ Regression for PR #1087 finding: append moved from immediately after credential
+ check to immediately before litellm.completion / batch_completion / responses call.
+ """
+ all_keys = {
+ "OPENAI_API_KEY": "fake_key",
+ "ANTHROPIC_API_KEY": "fake_key",
+ "GOOGLE_API_KEY": "fake_key",
+ }
+
+ # Configure context limits per-model: gpt-5-nano has a tiny limit so the
+ # 200K-token prompt will trigger pre-call rejection. claude-3 has a huge
+ # limit so it accepts the prompt and the call actually goes through.
+ def context_limit_side_effect(model_name):
+ if "gpt-5-nano" in str(model_name):
+ return 50_000
+ return 1_000_000
+
+ successful_response = create_mock_litellm_response("ok", model_name="claude-3")
+
+ with patch.dict(os.environ, all_keys):
+ with patch.object(_llm_invoke_module.litellm, "token_counter", return_value=200_000):
+ with patch.object(
+ _llm_invoke_module,
+ "get_context_limit",
+ side_effect=context_limit_side_effect,
+ ):
+ with patch.object(
+ _llm_invoke_module.litellm,
+ "completion",
+ return_value=successful_response,
+ ) as mock_comp:
+ result = _llm_invoke_module.llm_invoke(
+ prompt="huge prompt",
+ input_json={},
+ strength=1.0,
+ time=0.0,
+ use_cloud=False,
+ )
+
+ # Exactly one provider call: the second candidate. The first was skipped
+ # before any provider call was made.
+ assert mock_comp.call_count == 1
+ chain = result["attempted_models"]
+ assert "gpt-5-nano" not in chain, (
+ f"Context-window-skipped candidate must NOT appear in attempted_models, got {chain!r}"
+ )
+ # The candidate that was actually called must be present.
+ assert any("claude-3" in m for m in chain), (
+ f"Successful candidate must appear in attempted_models, got {chain!r}"
+ )
+
# =============================================================================
# Issue #796: TypeScript code validated as Python syntax
diff --git a/tests/test_summarize_directory.py b/tests/test_summarize_directory.py
index 99f2fb912..8c86d374f 100644
--- a/tests/test_summarize_directory.py
+++ b/tests/test_summarize_directory.py
@@ -1921,6 +1921,115 @@ def test_max_workers_thread_safe_cost_accumulation(self, tmp_path, mock_load_pro
assert abs(cost - 0.5) < 1e-9, f"Expected 0.5 total cost for 5 files, got {cost}"
+ def test_serial_path_does_not_double_propagate_attempted_models(
+ self, tmp_path, mock_load_prompt_template
+ ):
+ """Regression (codex round-7 finding 1): serial path
+ (max_workers == 1) must NOT call _propagate_attempted_models_to_ctx
+ from the orchestrator — llm_invoke's own ctx propagation already
+ lands the chain on ctx.obj['attempted_models'] from the main
+ thread. A second propagation would record each entry twice
+ (e.g. 'm1;m1' for one LLM call).
+ """
+ (tmp_path / "a.py").write_text("a")
+
+ with patch('pdd.summarize_directory.llm_invoke') as mock_llm, \
+ patch('pdd.summarize_directory._propagate_attempted_models_to_ctx') as mock_propagate:
+ mock_llm.return_value = {
+ 'result': FileSummary(file_summary="S", key_exports=["x"], dependencies=["y"]),
+ 'cost': 0.1,
+ 'model_name': "m1",
+ 'attempted_models': ['m1'],
+ }
+ summarize_directory(
+ directory_path=str(tmp_path / "*.py"),
+ strength=0.5,
+ temperature=0.0,
+ max_workers=1,
+ )
+
+ assert mock_propagate.call_count == 0, (
+ "Serial path must not double-propagate; llm_invoke already "
+ "covers ctx propagation on the main thread. "
+ f"Got {mock_propagate.call_count} call(s): "
+ f"{[c.args for c in mock_propagate.call_args_list]}"
+ )
+
+ def test_worker_exception_chain_recovered_by_aggregator(
+ self, tmp_path, mock_load_prompt_template
+ ):
+ """Regression (codex round-7 finding 2): when a worker's llm_invoke
+ raises with an `attempted_models` attribute attached (set by
+ llm_invoke's round-2 pre-loop wrapper / all-candidates RuntimeError),
+ _process_single_file_logic must harvest that chain in its broad
+ except block so the threaded aggregator can still propagate it.
+ Worker threads can't reach the parent ctx, so the exception
+ attribute is the ONLY recovery path.
+ """
+ (tmp_path / "a.py").write_text("a")
+
+ exc = RuntimeError("simulated llm exhaustion")
+ exc.attempted_models = ['m1', 'm2']
+
+ with patch('pdd.summarize_directory.llm_invoke', side_effect=exc), \
+ patch('pdd.summarize_directory._propagate_attempted_models_to_ctx') as mock_propagate:
+ summarize_directory(
+ directory_path=str(tmp_path / "*.py"),
+ strength=0.5,
+ temperature=0.0,
+ max_workers=2,
+ )
+
+ # Threaded aggregator must have propagated the recovered chain.
+ assert mock_propagate.call_count == 1, (
+ f"Expected 1 propagate call from threaded aggregator, got "
+ f"{mock_propagate.call_count}"
+ )
+ propagated = mock_propagate.call_args.args[0]
+ assert propagated == ['m1', 'm2'], (
+ f"Expected exception-attached chain ['m1','m2'], got {propagated}"
+ )
+
+ def test_max_workers_propagates_attempted_models_to_main_ctx(
+ self, tmp_path, mock_load_prompt_template
+ ):
+ """Regression (PR #1087): worker threads' Click context is
+ thread-local, so when llm_invoke calls _propagate_attempted_models_to_ctx
+ from inside a worker thread it cannot reach the main thread's ctx.
+ summarize_directory must aggregate every worker's attempted_models
+ chain and propagate to the parent ctx on the main thread after the
+ executor finishes, so @track_cost records the full chain in the
+ cost CSV for parallel pdd commands.
+ """
+ for i in range(3):
+ (tmp_path / f"file{i}.py").write_text(f"content_{i}")
+
+ with patch('pdd.summarize_directory.llm_invoke') as mock_llm, \
+ patch('pdd.summarize_directory._propagate_attempted_models_to_ctx') as mock_propagate:
+ mock_llm.return_value = {
+ 'result': FileSummary(file_summary="Summary", key_exports=["x"], dependencies=["y"]),
+ 'cost': 0.1,
+ 'model_name': "gpt-4o",
+ 'attempted_models': ['gpt-4', 'gpt-4o'],
+ }
+ summarize_directory(
+ directory_path=str(tmp_path / "*.py"),
+ strength=0.5,
+ temperature=0.0,
+ max_workers=2,
+ )
+
+ # Propagation must fire exactly once on the main thread (not per-worker),
+ # with the aggregated chain across all 3 files (each contributed
+ # ['gpt-4', 'gpt-4o'] = 2 entries × 3 files = 6 entries).
+ assert mock_propagate.call_count == 1, (
+ f"Expected exactly 1 main-thread propagate call, got {mock_propagate.call_count}"
+ )
+ propagated_chain = mock_propagate.call_args.args[0]
+ assert propagated_chain == ['gpt-4', 'gpt-4o'] * 3, (
+ f"Expected ['gpt-4','gpt-4o']*3 aggregated chain, got {propagated_chain}"
+ )
+
# ---------------------------------------------------------------------------
# Real-LLM: Multi-directory CSV accumulation and cache (requires API key)
diff --git a/tests/test_track_cost.py b/tests/test_track_cost.py
index 345787a74..368e82850 100644
--- a/tests/test_track_cost.py
+++ b/tests/test_track_cost.py
@@ -1,3 +1,27 @@
+# Test plan for pdd.track_cost (spec: prompts/track_cost_python.prompt)
+# ---------------------------------------------------------------------
+# Existing tests (kept as-is) cover:
+# * CSV row appended when file exists with content (no header rewrite)
+# * CSV header written when file does not exist
+# * CSV header written when file exists but is empty (regression)
+# * output_cost path resolved via ctx.obj vs PDD_OUTPUT_COST_PATH env
+# * cost and model name extracted from legacy result tuple
+# * Short / non-tuple results -> empty cost & model
+# * Input/output file collection (single, multiple, mixed types)
+# * Exception during CSV logging surfaces via rprint
+# * Missing Click context returns command result without logging
+# * Files still tracked for core dump when the wrapped command raises
+# * PYTEST_CURRENT_TEST env var skips CSV writing
+# * _parse_cost_from_csv roundtrip with an empty pre-created CSV
+#
+# New tests appended below cover the spec additions for the
+# `attempted_models` column:
+# * Header column order matches the spec exactly
+# * Enriched result shape: dict last element supplies cost/model/attempted
+# * Enriched dict attempted_models serialized as ';'-joined chain
+# * command-scoped ctx.obj['attempted_models'] takes precedence over the result dict
+# * Empty string used in the CSV when no chain is recorded
+# * Empty command-scoped attempted_models falls back to the result dict's chain
import pytest
import unittest.mock as mock
from unittest.mock import MagicMock, mock_open, patch
@@ -5,7 +29,7 @@
import re
from datetime import datetime
from typing import Tuple
-from pdd.track_cost import track_cost
+from pdd.track_cost import track_cost, extract_cost_and_model
from pdd.agentic_sync_runner import _parse_cost_from_csv
# Sample command function to be decorated
@@ -111,11 +135,14 @@ def test_csv_row_appended_if_file_exists_with_content(mock_click_context, mock_o
mock.patch('os.path.getsize', return_value=100):
result = sample_command(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # The writer peeks only the header of the existing file (to detect a
+ # missing 'attempted_models' column) before opening for append, so we
+ # expect both calls.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
handle = mock_open_file()
assert not any('timestamp,model,command,cost,input_files,output_files' in call.args[0] for call in handle.write.call_args_list)
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
mock_rprint.assert_not_called()
@@ -144,13 +171,16 @@ def test_csv_header_written_if_file_exists_but_empty(mock_click_context, mock_op
mock.patch.dict(os.environ, {'PDD_OUTPUT_COST_PATH': '/tmp/cost_abc.csv'}):
result = sample_command(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
- mock_open_file.assert_called_once_with('/tmp/cost_abc.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/tmp/cost_abc.csv', 'a', newline='', encoding='utf-8')
handle = mock_open_file()
# Header MUST be written when file is empty
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Data row should follow (command name is 'sync' from mock context)
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,sync,25.5,/path/to/prompt.txt,/path/to/output\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,sync,25.5,/path/to/prompt.txt,/path/to/output,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
mock_rprint.assert_not_called()
@@ -198,14 +228,17 @@ def test_output_cost_path_via_param(mock_click_context, mock_open_file, mock_rpr
result = sample_command(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
# Ensure that open was called with the correct path and mode
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Use a regex pattern to match the row, ignoring the specific timestamp
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -236,15 +269,17 @@ def test_output_cost_path_via_env(mock_click_context, mock_open_file, mock_rprin
with mock.patch('os.path.isfile', return_value=False):
result = sample_command(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
- # Ensure that open was called with the path from environment variable
- mock_open_file.assert_called_once_with('/env/path/cost.csv', 'a', newline='', encoding='utf-8')
+ # Ensure that open was called with the path from environment variable.
+ # Lock-file open (cost.csv.lock) now happens alongside; assert the
+ # cost.csv 'a' open occurred, not that it was the ONLY open call.
+ mock_open_file.assert_any_call('/env/path/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Use a regex pattern to match the row, ignoring the specific timestamp
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -274,14 +309,17 @@ def test_csv_header_written_if_file_not_exists(mock_click_context, mock_open_fil
result = sample_command(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
# Ensure that open was called once
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Header should be written first
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Data row should be written
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,gpt-3,generate,25.5,/path/to/prompt.txt,/path/to/output,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -313,14 +351,17 @@ def train_command(ctx, input_file: str, output: str = None) -> Tuple[str, float,
result = train_command(mock_ctx, '/path/to/input.txt', output='/path/to/output')
# Ensure that open was called with the correct path
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Header should be written
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Data row should have correct cost and model
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,bert-base,train,50.0,/path/to/input.txt,/path/to/output\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,bert-base,train,50.0,/path/to/input.txt,/path/to/output,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -353,14 +394,17 @@ def short_result_command(ctx, prompt_file: str) -> Tuple[str]:
result = short_result_command(mock_ctx, '/path/to/prompt.txt')
# Ensure that open was called
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Header should be written
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Data row should have empty cost and model
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,,short,,/path/to/prompt.txt,\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,,short,,/path/to/prompt.txt,,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -394,14 +438,17 @@ def process_command(ctx, input_file: str, output_file: str) -> Tuple[str, float,
result = process_command(mock_ctx, '/path/to/input.txt', output_file='/path/to/output.txt')
# Ensure that open was called with the correct path
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Header should be written
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Data row should have correct input and output files
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,custom-model,process,15.0,/path/to/input.txt,/path/to/output.txt\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,custom-model,process,15.0,/path/to/input.txt,/path/to/output.txt,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -440,14 +487,17 @@ def batch_command(ctx, input_files: list, output_files: list, output_cost: str)
)
# Ensure that open was called with the correct path
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Header should be written
- handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files\r\n')
+ handle.write.assert_any_call('timestamp,model,command,cost,input_files,output_files,attempted_models\r\n')
# Data row should have multiple input and output files separated by semicolons
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,batch-model,batch,100.0,/path/to/input1.txt;/path/to/input2.txt,/path/to/output1.txt;/path/to/output2.txt\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,batch-model,batch,100.0,/path/to/input1.txt;/path/to/input2.txt,/path/to/output1.txt;/path/to/output2.txt,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -478,8 +528,9 @@ def test_exception_in_logging(mock_click_context, mock_open_file, mock_rprint):
mock.patch('os.path.getsize', return_value=100):
result = sample_command(mock_ctx, '/path/to/prompt.txt')
- # Ensure that open was attempted
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Ensure that open was attempted for append (a prior read-open may also
+ # have been issued for the migration check; both raise IOError).
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Ensure that an error was printed using rprint
mock_rprint.assert_called_once()
@@ -516,12 +567,15 @@ def mixed_command(ctx, input_file: str, output_file: str, config: dict) -> Tuple
result = mixed_command(mock_ctx, '/path/to/input.txt', output_file='/path/to/output.txt', config={'key': 'value'})
# Ensure that open was called with the correct path
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Data row should include only string file paths
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,mixed-model,mixed,30.0,/path/to/input.txt,/path/to/output.txt\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,mixed-model,mixed,30.0,/path/to/input.txt,/path/to/output.txt,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -570,12 +624,15 @@ def non_tuple_command(ctx, prompt_file: str) -> str:
result = non_tuple_command(mock_ctx, '/path/to/prompt.txt')
# Ensure that open was called
- mock_open_file.assert_called_once_with('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
+ # Lock-file open (cost.csv.lock) now happens alongside the cost.csv
+ # append; assert the cost.csv 'a' open occurred, not that it was the
+ # ONLY open call.
+ mock_open_file.assert_any_call('/path/to/cost.csv', 'a', newline='', encoding='utf-8')
# Retrieve the file handle to check written content
handle = mock_open_file()
# Data row should have empty cost and model
- row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,,non_tuple,,/path/to/prompt.txt,\r\n')
+ row_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,,non_tuple,,/path/to/prompt.txt,,\r\n')
assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list)
# Ensure no error was printed
@@ -822,3 +879,828 @@ def test_empty_tempfile_roundtrip_with_parse_cost(tmp_path):
# Clean up
os.unlink(cost_file.name)
+
+
+# ---------------------------------------------------------------------------
+# Tests for the spec additions: enriched result shape and attempted_models.
+# ---------------------------------------------------------------------------
+
+
+def test_header_column_order_matches_spec(mock_click_context, mock_open_file, mock_rprint):
+ """The CSV header must be exactly:
+ timestamp,model,command,cost,input_files,output_files,attempted_models
+ """
+ mock_ctx = create_mock_context(
+ 'generate',
+ {'prompt_file': '/path/to/prompt.txt', 'output_cost': '/p/c.csv'},
+ obj={'output_cost': '/p/c.csv'},
+ )
+ mock_click_context.return_value = mock_ctx
+
+ with mock.patch('os.path.isfile', return_value=False):
+ sample_command(mock_ctx, '/path/to/prompt.txt')
+
+ handle = mock_open_file()
+ handle.write.assert_any_call(
+ 'timestamp,model,command,cost,input_files,output_files,attempted_models\r\n'
+ )
+
+
+def test_enriched_result_dict_supplies_cost_and_model(
+ mock_click_context, mock_open_file, mock_rprint
+):
+ """When the last tuple element is a dict, read cost/model_name from it."""
+ @track_cost
+ def enriched_command(ctx, prompt_file: str):
+ return ('/path/to/output', {
+ 'cost': 0.42,
+ 'model_name': 'claude-opus',
+ 'attempted_models': ['claude-opus'],
+ })
+
+ mock_ctx = create_mock_context(
+ 'enriched',
+ {'prompt_file': '/path/to/prompt.txt', 'output_cost': '/p/c.csv'},
+ obj={'output_cost': '/p/c.csv'},
+ )
+ mock_click_context.return_value = mock_ctx
+
+ with mock.patch('os.path.isfile', return_value=False):
+ result = enriched_command(mock_ctx, '/path/to/prompt.txt')
+
+ handle = mock_open_file()
+ row_pattern = re.compile(
+ r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+,'
+ r'claude-opus,enriched,0\.42,/path/to/prompt\.txt,,claude-opus\r\n'
+ )
+ assert any(row_pattern.match(call.args[0]) for call in handle.write.call_args_list), \
+ f"No matching row in: {[c.args[0] for c in handle.write.call_args_list]}"
+ assert result[0] == '/path/to/output'
+
+
+def test_enriched_attempted_models_serialized_chronologically(
+ mock_click_context, mock_open_file, mock_rprint
+):
+ """attempted_models chain is serialized as ';'-joined in chronological order."""
+ @track_cost
+ def chain_command(ctx, prompt_file: str):
+ return ('/path/to/output', {
+ 'cost': 0.1,
+ 'model_name': 'final-model',
+ 'attempted_models': ['first-try', 'second-try', 'final-model'],
+ })
+
+ mock_ctx = create_mock_context(
+ 'chain',
+ {'prompt_file': '/path/to/prompt.txt', 'output_cost': '/p/c.csv'},
+ obj={'output_cost': '/p/c.csv'},
+ )
+ mock_click_context.return_value = mock_ctx
+
+ with mock.patch('os.path.isfile', return_value=False):
+ chain_command(mock_ctx, '/path/to/prompt.txt')
+
+ handle = mock_open_file()
+ writes = [c.args[0] for c in handle.write.call_args_list]
+ assert any('first-try;second-try;final-model\r\n' in w for w in writes), \
+ f"Chain not serialized chronologically. writes={writes}"
+
+
+def test_command_scoped_attempted_models_takes_precedence(
+ mock_click_context, mock_rprint, tmp_path
+):
+ """The chain populated during this command wins over the result dict."""
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ csv_path = tmp_path / "cost.csv"
+ shared_obj = {'output_cost': str(csv_path)}
+
+ @track_cost
+ def both_command(ctx, prompt_file: str):
+ _propagate_attempted_models_to_ctx(['ctx-A', 'ctx-B'])
+ return ('/path/to/output', {
+ 'cost': 0.5,
+ 'model_name': 'final',
+ 'attempted_models': ['from-result-only'],
+ })
+
+ mock_ctx = MagicMock()
+ mock_ctx.command.name = 'both'
+ mock_ctx.params = {'prompt_file': '/path/to/prompt.txt'}
+ mock_ctx.obj = shared_obj
+ mock_click_context.return_value = mock_ctx
+
+ both_command(mock_ctx, '/path/to/prompt.txt')
+
+ import csv as _csv
+ with open(csv_path, newline='', encoding='utf-8') as fp:
+ rows = list(_csv.DictReader(fp))
+ assert rows[-1]['attempted_models'] == 'ctx-A;ctx-B'
+ assert rows[-1]['attempted_models'] != 'from-result-only'
+ assert 'attempted_models' not in shared_obj
+
+
+def test_attempted_models_empty_when_no_chain(
+ mock_click_context, mock_open_file, mock_rprint
+):
+ """Empty string used in the CSV when neither ctx.obj nor result supplies a chain."""
+ @track_cost
+ def no_chain_command(ctx, prompt_file: str):
+ # Legacy shape, no attempted_models anywhere.
+ return ('/path/to/output', 0.05, 'gpt-4')
+
+ mock_ctx = create_mock_context(
+ 'no_chain',
+ {'prompt_file': '/path/to/prompt.txt', 'output_cost': '/p/c.csv'},
+ obj={'output_cost': '/p/c.csv'},
+ )
+ mock_click_context.return_value = mock_ctx
+
+ with mock.patch('os.path.isfile', return_value=False):
+ no_chain_command(mock_ctx, '/path/to/prompt.txt')
+
+ handle = mock_open_file()
+ writes = [c.args[0] for c in handle.write.call_args_list]
+ # Expect the row to END with `,\r\n` for the empty attempted_models cell.
+ data_rows = [w for w in writes if 'no_chain' in w]
+ assert data_rows, f"No data row found. writes={writes}"
+ assert data_rows[0].endswith(',\r\n'), \
+ f"Row should end with empty attempted_models, got: {data_rows[0]!r}"
+
+
+def test_empty_command_attempted_models_falls_back_to_result_dict(
+ mock_click_context, mock_rprint, tmp_path
+):
+ """Precedence: the command-scoped chain must be non-empty to win."""
+ csv_path = tmp_path / "cost.csv"
+ shared_obj = {'output_cost': str(csv_path), 'attempted_models': []}
+
+ @track_cost
+ def fallback_command(ctx, prompt_file: str):
+ return ('/path/to/output', {
+ 'cost': 0.2,
+ 'model_name': 'm',
+ 'attempted_models': ['from-result'],
+ })
+
+ mock_ctx = MagicMock()
+ mock_ctx.command.name = 'fallback'
+ mock_ctx.params = {'prompt_file': '/path/to/prompt.txt'}
+ mock_ctx.obj = shared_obj
+ mock_click_context.return_value = mock_ctx
+
+ fallback_command(mock_ctx, '/path/to/prompt.txt')
+
+ import csv as _csv
+ with open(csv_path, newline='', encoding='utf-8') as fp:
+ rows = list(_csv.DictReader(fp))
+ assert rows[-1]['attempted_models'] == 'from-result'
+ assert shared_obj['attempted_models'] == []
+
+
+def test_extract_cost_and_model_enriched_shape():
+ """extract_cost_and_model reads cost/model/attempted from a dict last element."""
+ result = ('out', {
+ 'cost': 0.33,
+ 'model_name': 'modelX',
+ 'attempted_models': ['a', 'b'],
+ })
+ cost, model, attempted = extract_cost_and_model(result)
+ assert cost == 0.33
+ assert model == 'modelX'
+ assert attempted == ['a', 'b']
+
+
+def test_extract_cost_and_model_legacy_shape():
+ """Legacy: (..., cost, model_name) returns no attempted_models chain."""
+ result = ('out', 0.07, 'gpt-4')
+ cost, model, attempted = extract_cost_and_model(result)
+ assert cost == 0.07
+ assert model == 'gpt-4'
+ assert attempted == []
+
+
+def test_extract_cost_and_model_non_tuple_returns_empty():
+ """Non-tuple result returns the empty-string sentinel for cost & model."""
+ cost, model, attempted = extract_cost_and_model('just a string')
+ assert cost == ''
+ assert model == ''
+ assert attempted == []
+
+
+def test_existing_csv_without_attempted_models_is_migrated(
+ mock_click_context, mock_rprint, tmp_path
+):
+ """An existing 6-column cost.csv should be migrated to include the new
+ 'attempted_models' header so subsequent rows are addressable by name via
+ csv.DictReader instead of leaking into ``None``.
+ """
+ csv_path = tmp_path / "cost.csv"
+ # Pre-existing CSV with the old six-column schema.
+ csv_path.write_text(
+ "timestamp,model,command,cost,input_files,output_files\n"
+ "2026-01-01T00:00:00.000,gpt-4,generate,1.0,a.txt,b.txt\n",
+ encoding="utf-8",
+ )
+
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ shared_obj = {'output_cost': str(csv_path)}
+ mock_ctx = MagicMock()
+ mock_ctx.command.name = 'generate'
+ mock_ctx.params = {
+ 'prompt_file': '/path/to/prompt.txt',
+ 'output_cost': str(csv_path),
+ 'output': '/path/to/output',
+ }
+ mock_ctx.obj = shared_obj
+ mock_click_context.return_value = mock_ctx
+
+ @track_cost
+ def migrating_command(ctx, prompt_file: str, output: str):
+ _propagate_attempted_models_to_ctx(['vertex_ai/gemini', 'deepseek/chat'])
+ return (output, 25.5, 'deepseek/chat')
+
+ migrating_command(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
+
+ content = csv_path.read_text(encoding='utf-8')
+ lines = content.strip().splitlines()
+ header = lines[0].split(',')
+ assert header[:6] == [
+ 'timestamp', 'model', 'command', 'cost', 'input_files', 'output_files'
+ ]
+ assert header[-1] == 'attempted_models'
+
+ # csv.DictReader should now expose the new column under its proper name
+ # (instead of dumping the extra value into ``None``).
+ import csv as _csv
+ with open(csv_path, newline='', encoding='utf-8') as fp:
+ rows = list(_csv.DictReader(fp))
+ assert rows[-1]['attempted_models'] == 'vertex_ai/gemini;deepseek/chat'
+ assert None not in rows[-1]
+
+ mock_rprint.assert_not_called()
+
+
+def test_track_cost_captures_chain_when_ctx_obj_created_in_command(
+ mock_click_context, mock_rprint, tmp_path
+):
+ """Regression (codex round 4, P2): when a @track_cost command starts
+ with ctx.obj is None and the wrapped func promotes it inside the call
+ (e.g. via ctx.ensure_object(dict)), the attempted_models chain that
+ llm_invoke populates on the newly-created ctx.obj must still land in
+ the CSV row. Previously the post-command capture was gated on
+ attempted_models_scoped, which is False when ctx.obj is None at entry,
+ so the captured chain stayed empty and the row's attempted_models
+ column was blank even though a chain had been recorded.
+ """
+ csv_path = tmp_path / "cost.csv"
+
+ mock_ctx = MagicMock()
+ mock_ctx.command.name = 'generate'
+ mock_ctx.params = {
+ 'prompt_file': '/path/to/prompt.txt',
+ 'output_cost': str(csv_path),
+ 'output': '/path/to/output',
+ }
+ # Entry state: ctx.obj is None. The snapshot/restore guard in
+ # @track_cost will NOT fire because hasattr(None, '__setitem__') is
+ # False. Capture must still happen post-command.
+ mock_ctx.obj = None
+ mock_click_context.return_value = mock_ctx
+
+ @track_cost
+ def cmd(ctx, prompt_file, output):
+ # Simulate the command (or its callees, e.g. llm_invoke) promoting
+ # ctx.obj inside the call.
+ mock_ctx.obj = {
+ 'output_cost': str(csv_path),
+ 'attempted_models': ['m1'],
+ }
+ return (output, 1.0, 'm1')
+
+ cmd(mock_ctx, '/path/to/prompt.txt', output='/path/to/output')
+
+ import csv as _csv
+ with open(csv_path, newline='', encoding='utf-8') as fp:
+ rows = list(_csv.DictReader(fp))
+ assert rows, "Expected at least one row in cost CSV"
+ assert rows[-1]['attempted_models'] == 'm1', (
+ f"Expected captured chain 'm1', got {rows[-1]['attempted_models']!r}"
+ )
+
+ mock_rprint.assert_not_called()
+
+
+def test_attempted_models_scoped_to_single_command_invocation(
+ mock_click_context, mock_rprint, tmp_path
+):
+ """Regression for issue #1086: when two @track_cost commands share a
+ ctx.obj (chained / batched CLI run), the second command's CSV row must
+ contain ONLY its own attempted models — not those accumulated during a
+ prior command's execution. Previously _propagate_attempted_models_to_ctx
+ appended to a shared ctx.obj entry that track_cost never reset, so the
+ second row leaked the first command's fallback chain.
+ """
+ from pdd.llm_invoke import _propagate_attempted_models_to_ctx
+
+ csv_path = tmp_path / "cost.csv"
+ shared_obj = {'output_cost': str(csv_path)}
+
+ @track_cost
+ def cmd_one(ctx, prompt_file: str):
+ _propagate_attempted_models_to_ctx(['m1'])
+ return ('/path/to/out1', 0.1, 'm1')
+
+ @track_cost
+ def cmd_two(ctx, prompt_file: str):
+ _propagate_attempted_models_to_ctx(['m2'])
+ return ('/path/to/out2', 0.2, 'm2')
+
+ # Use a SHARED real dict for ctx.obj across both commands so the
+ # production aliasing pattern is faithfully reproduced.
+ mock_ctx_one = MagicMock()
+ mock_ctx_one.command.name = 'cmd_one'
+ mock_ctx_one.params = {'prompt_file': '/p.txt'}
+ mock_ctx_one.obj = shared_obj
+
+ mock_ctx_two = MagicMock()
+ mock_ctx_two.command.name = 'cmd_two'
+ mock_ctx_two.params = {'prompt_file': '/p.txt'}
+ mock_ctx_two.obj = shared_obj
+
+ mock_click_context.return_value = mock_ctx_one
+ cmd_one(mock_ctx_one, '/p.txt')
+
+ mock_click_context.return_value = mock_ctx_two
+ cmd_two(mock_ctx_two, '/p.txt')
+
+ import csv as _csv
+ with open(csv_path, newline='', encoding='utf-8') as fp:
+ rows = list(_csv.DictReader(fp))
+ rows_by_command = {row['command']: row for row in rows}
+ assert 'cmd_one' in rows_by_command, f"missing cmd_one row: {rows}"
+ assert 'cmd_two' in rows_by_command, f"missing cmd_two row: {rows}"
+
+ assert rows_by_command['cmd_one']['attempted_models'] == 'm1'
+ # The critical assertion: cmd_two must NOT inherit m1 from the prior
+ # command's scope. It must contain only its own attempt.
+ assert rows_by_command['cmd_two']['attempted_models'] == 'm2', (
+ f"cmd_two leaked attempted_models from cmd_one: "
+ f"{rows_by_command['cmd_two']['attempted_models']!r}"
+ )
+ # And the shared ctx.obj should have been restored to a clean state
+ # (no 'attempted_models' key persists after the last @track_cost wrapper
+ # exits, because there was no prior value to restore to).
+ assert 'attempted_models' not in shared_obj
+
+
+import sys
+
+
+def test_migration_serialized_by_lock(
+ mock_click_context, mock_rprint, tmp_path
+):
+ """Regression (codex round-7 finding 3, 3rd-pass review): concurrent
+ @track_cost invocations against a legacy 6-column cost.csv must not
+ lose rows to the migrate+append race. The portable atomic lock
+ (O_CREAT|O_EXCL) serializes both the migration and the subsequent
+ append on POSIX and Windows alike — no fcntl fallback.
+
+ This is a structural smoke test using threads (which serialize via
+ the GIL during the critical section) — full multi-process race
+ coverage would require subprocess fixture machinery. The
+ lock acquire/release code path is exercised end-to-end, proving
+ the lock is opened/closed correctly and the migration completes
+ under the lock.
+ """
+ import threading
+
+ csv_path = tmp_path / "cost.csv"
+ # Pre-existing 6-column CSV (legacy schema).
+ csv_path.write_text(
+ "timestamp,model,command,cost,input_files,output_files\n"
+ "2026-01-01T00:00:00.000,gpt-4,sync,0.01,a.py,b.py\n",
+ encoding="utf-8",
+ )
+
+ def _do_append(model_name):
+ # Each thread gets its own mock ctx so command-name doesn't collide.
+ ctx = MagicMock()
+ ctx.command.name = 'generate'
+ ctx.params = {
+ 'prompt_file': '/p.txt',
+ 'output_cost': str(csv_path),
+ 'output': '/o.txt',
+ }
+ ctx.obj = {'output_cost': str(csv_path)}
+
+ @track_cost
+ def cmd(ctx_arg, prompt_file, output):
+ return (output, 0.5, model_name)
+
+ # Each thread needs its own get_current_context patch — patch the
+ # underlying module-level function so all threads see the same patch
+ # but it returns the per-thread mock ctx via a thread-local sentinel.
+ # Simpler: use a per-thread patch context manager.
+ with patch('pdd.track_cost.click.get_current_context', return_value=ctx):
+ cmd(ctx, '/p.txt', output='/o.txt')
+
+ threads = [
+ threading.Thread(target=_do_append, args=('m1',)),
+ threading.Thread(target=_do_append, args=('m2',)),
+ ]
+ for t in threads:
+ t.start()
+ for t in threads:
+ t.join()
+
+ content = csv_path.read_text(encoding='utf-8')
+ lines = [line for line in content.splitlines() if line.strip()]
+ # Header (migrated, 7-column) + 1 original legacy row + 2 new rows.
+ assert len(lines) == 4, (
+ f"expected 4 lines (1 header + 1 legacy + 2 new), got "
+ f"{len(lines)}: {lines!r}"
+ )
+ assert 'attempted_models' in lines[0], (
+ f"header must be migrated to 7-column, got: {lines[0]!r}"
+ )
+ # Both new rows must be present (no lost-update).
+ joined = '\n'.join(lines)
+ assert ',m1,' in joined and ',m2,' in joined, (
+ f"both appended rows must survive the race; got:\n{joined}"
+ )
+
+ mock_rprint.assert_not_called()
+
+
+def test_acquire_atomic_lock_steals_lock_with_dead_pid(tmp_path):
+ """Regression (4th-pass review, finding 2): staleness is determined by
+ PID liveness, not by mtime. A lock whose holder PID is no longer alive
+ must be reclaimed immediately — the previous mtime-only logic would
+ either steal too eagerly (deleting a legitimately slow holder's lock)
+ or wait the full mtime threshold even for dead processes.
+ """
+ import os
+ from pdd.track_cost import (
+ _acquire_atomic_lock,
+ _release_atomic_lock,
+ )
+
+ lock_path = str(tmp_path / "cost.csv.lock")
+
+ # Plant a lock whose recorded PID is definitely dead — pick a value
+ # above MAX_PID on Linux/macOS so os.kill(pid, 0) raises immediately.
+ dead_pid = 999_999_999
+ with open(lock_path, 'w', encoding='utf-8') as f:
+ f.write(f"{dead_pid}-stalefakenoncehex")
+
+ handle, contended = _acquire_atomic_lock(lock_path)
+ try:
+ assert handle is not None, (
+ "lock with dead PID must be reclaimed immediately; "
+ "_acquire_atomic_lock returned None"
+ )
+ fd, nonce = handle
+ # The nonce we wrote must NOT be the planted dead-PID nonce.
+ assert "stalefakenoncehex" not in nonce, (
+ "fresh nonce must replace the dead holder's identifier"
+ )
+ finally:
+ _release_atomic_lock(handle, lock_path)
+
+ assert not os.path.exists(lock_path), (
+ "after release, the lock file must be unlinked"
+ )
+
+
+def test_acquire_atomic_lock_does_not_steal_live_pid(tmp_path):
+ """Regression (4th-pass review, finding 2): a lock held by a LIVE
+ PID must NOT be stolen even after the mtime ``_LOCK_STALE_SECONDS``
+ threshold. The previous design treated any lock older than 60s as
+ stale, which would clobber a legitimate slow migration.
+ """
+ import os
+ import time
+ from pdd.track_cost import _acquire_atomic_lock, _LOCK_STALE_SECONDS
+
+ lock_path = str(tmp_path / "cost.csv.lock")
+
+ # Plant a lock whose recorded PID IS us — definitely alive — and
+ # backdate the mtime well past the safety-net threshold.
+ with open(lock_path, 'w', encoding='utf-8') as f:
+ f.write(f"{os.getpid()}-livepidnoncehex")
+ very_old = time.time() - (_LOCK_STALE_SECONDS * 5)
+ os.utime(lock_path, (very_old, very_old))
+
+ # The acquire MUST exhaust retries (because the holder is alive) and
+ # report contention. We use a short manual retry by patching the
+ # constant via monkeypatching the module — but simplest: just call
+ # and check it returned None + contended=True quickly. The 30s
+ # default budget would make this slow, so monkey-patch the retry
+ # cap down to keep CI fast.
+ import pdd.track_cost as _tc
+ saved_max = _tc._LOCK_RETRY_MAX
+ _tc._LOCK_RETRY_MAX = 5 # 0.5s total — fast for a "must not steal" assert
+ try:
+ handle, contended = _acquire_atomic_lock(lock_path)
+ finally:
+ _tc._LOCK_RETRY_MAX = saved_max
+
+ try:
+ assert handle is None, (
+ "lock held by a LIVE PID must NOT be stolen regardless of "
+ "mtime; _acquire_atomic_lock returned a handle anyway"
+ )
+ assert contended is True, (
+ "contention must be reported when the live holder kept us out"
+ )
+ assert os.path.exists(lock_path), (
+ "the live holder's lock file must remain untouched"
+ )
+ finally:
+ # Clean up our planted file so tmp_path teardown succeeds.
+ try:
+ os.unlink(lock_path)
+ except OSError:
+ pass
+
+
+def test_release_atomic_lock_does_not_unlink_after_steal(tmp_path):
+ """Regression (4th-pass review, finding 3): if our lock was stolen by
+ another process while we held it, our release must NOT delete the
+ new owner's lock file. The nonce check on release prevents this race.
+ """
+ import os
+ from pdd.track_cost import _acquire_atomic_lock, _release_atomic_lock
+
+ lock_path = str(tmp_path / "cost.csv.lock")
+
+ # Acquire normally; capture the handle (fd, nonce).
+ handle, _ = _acquire_atomic_lock(lock_path)
+ assert handle is not None, "initial acquire must succeed"
+
+ # Simulate a stale-steal: another owner overwrites the lock file with
+ # their own nonce.
+ other_owner_content = "12345-otheownernoncehex"
+ with open(lock_path, 'w', encoding='utf-8') as f:
+ f.write(other_owner_content)
+
+ # Now release — we hold an fd to the OLD file, and the disk content
+ # has a different nonce. The release MUST detect this mismatch and
+ # leave the lock file alone.
+ _release_atomic_lock(handle, lock_path)
+
+ assert os.path.exists(lock_path), (
+ "release after stale-steal must not unlink the new owner's lock"
+ )
+ with open(lock_path, 'r', encoding='utf-8') as f:
+ remaining = f.read()
+ assert remaining == other_owner_content, (
+ "release after stale-steal must not modify the new owner's content; "
+ f"got {remaining!r}"
+ )
+
+ # Cleanup for tmp_path teardown.
+ os.unlink(lock_path)
+
+
+def test_migration_skips_write_on_lock_failure(mock_click_context, mock_rprint, tmp_path):
+ """Regression (5th-pass review, finding 1): when migration is needed
+ but the lock can't be acquired (contended timeout or unsupported
+ filesystem), the write MUST be skipped — not fall through unlocked
+ while the original holder is still in the migration critical section.
+ Falling through reopens the lost-row race.
+ """
+ import csv as _csv
+ csv_path = tmp_path / "cost.csv"
+ # Pre-existing 6-column legacy CSV — migration will trigger.
+ csv_path.write_text(
+ "timestamp,model,command,cost,input_files,output_files\n"
+ "2026-01-01T00:00:00.000,gpt-4,sync,0.01,a.py,b.py\n",
+ encoding="utf-8",
+ )
+ legacy_content = csv_path.read_text(encoding="utf-8")
+
+ mock_ctx = create_mock_context(
+ 'sync',
+ {'prompt_file': '/p.txt', 'output_cost': str(csv_path), 'output': '/o.txt'},
+ obj={'output_cost': str(csv_path)}
+ )
+ mock_click_context.return_value = mock_ctx
+
+ @track_cost
+ def cmd(ctx_arg, prompt_file, output):
+ return (output, 0.5, 'm1')
+
+ # Force lock acquisition to fail by patching _acquire_atomic_lock.
+ with patch('pdd.track_cost._acquire_atomic_lock', return_value=(None, True)):
+ cmd(mock_ctx, '/p.txt', output='/o.txt')
+
+ # The CSV must be byte-identical to the legacy content — NOT migrated,
+ # NOT appended unlocked. The lost-row race only matters if we wrote;
+ # by skipping, we leave the file safe for the real holder.
+ after = csv_path.read_text(encoding="utf-8")
+ assert after == legacy_content, (
+ "migration with failed lock must skip the write entirely; got:\n" + after
+ )
+ # A loud warning must have fired so the user knows the row was lost.
+ warn_calls = [c.args[0] for c in mock_rprint.call_args_list]
+ assert any('could not acquire CSV lock for legacy migration' in w for w in warn_calls), (
+ f"Expected migration-skip warning; got rprint calls: {warn_calls!r}"
+ )
+
+
+def test_acquire_atomic_lock_retries_on_partial_nonce_write(tmp_path, monkeypatch):
+ """Regression (5th-pass review, finding 2): when nonce write fails or
+ is partial, the acquire must close + unlink + retry rather than
+ returning a malformed lock file that future processes wait 10 minutes
+ on (the mtime stale threshold for unparseable nonces).
+ """
+ import os as _os
+ from pdd import track_cost as _tc
+
+ lock_path = str(tmp_path / "cost.csv.lock")
+
+ # Patch os.write to fail the FIRST call only — second call (after
+ # cleanup + retry) succeeds normally.
+ real_write = _os.write
+ call_count = {'n': 0}
+
+ def flaky_write(fd, data):
+ call_count['n'] += 1
+ if call_count['n'] == 1:
+ # Simulate partial write — returns fewer bytes than data.
+ return 0
+ return real_write(fd, data)
+
+ monkeypatch.setattr(_tc.os, 'write', flaky_write)
+
+ handle, contended = _tc._acquire_atomic_lock(lock_path)
+ try:
+ assert handle is not None, (
+ "acquire must retry after partial-write failure; got None"
+ )
+ # After the first failed write, the file must have been cleaned
+ # up (no malformed lock left behind). The second attempt succeeds
+ # and the file now contains our valid nonce.
+ with open(lock_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+ assert '-' in content and content.startswith(str(os.getpid())), (
+ f"after retry the lock file must contain our nonce; got {content!r}"
+ )
+ finally:
+ _tc._release_atomic_lock(handle, lock_path)
+
+ assert not os.path.exists(lock_path), (
+ "after release, the lock file must be unlinked"
+ )
+
+
+def test_release_atomic_lock_unlinks_malformed_lock(tmp_path):
+ """Regression (5th-pass review, finding 3): when the lock file
+ exists but has no parseable nonce (e.g. truncated externally),
+ release MUST still unlink the file. Leaving it behind would block
+ every subsequent PDD invocation for the mtime stale window.
+ """
+ import os as _os
+ from pdd import track_cost as _tc
+
+ lock_path = str(tmp_path / "cost.csv.lock")
+
+ # Acquire normally.
+ handle, _ = _tc._acquire_atomic_lock(lock_path)
+ assert handle is not None, "initial acquire must succeed"
+
+ # Truncate the file content so _read_lock_owner returns None.
+ with open(lock_path, 'w', encoding='utf-8') as f:
+ f.write('')
+
+ # Release with a malformed (empty) lock file. Previously the release
+ # would silently return; now it unlinks because we hold the
+ # O_EXCL-acquired fd.
+ _tc._release_atomic_lock(handle, lock_path)
+
+ assert not _os.path.exists(lock_path), (
+ "release must unlink an unparseable lock that we created via "
+ "O_EXCL; otherwise future PDD invocations are blocked for the "
+ "mtime stale window."
+ )
+
+
+def test_peek_survives_malformed_csv_first_row(mock_click_context, mock_rprint, tmp_path):
+ """Regression (6th-pass review, finding 1): a malformed cost.csv first
+ row (csv.Error during the pre-lock peek) must NOT drop the cost row.
+ Previously only OSError was caught, so csv.Error bubbled up to the
+ outer except and the command silently lost its cost record.
+ """
+ csv_path = tmp_path / "cost.csv"
+ # Malformed CSV: unterminated quoted field — csv.reader raises csv.Error.
+ csv_path.write_bytes(b'timestamp,model,"command\n')
+
+ mock_ctx = create_mock_context(
+ 'sync',
+ {'prompt_file': '/p.txt', 'output_cost': str(csv_path), 'output': '/o.txt'},
+ obj={'output_cost': str(csv_path)}
+ )
+ mock_click_context.return_value = mock_ctx
+
+ @track_cost
+ def cmd(ctx_arg, prompt_file, output):
+ return (output, 0.5, 'm1')
+
+ cmd(mock_ctx, '/p.txt', output='/o.txt')
+
+ # The new cost row must be appended despite the malformed prefix.
+ content = csv_path.read_bytes()
+ assert b',m1,' in content, (
+ "malformed CSV must not drop the cost row; final content:\n" + repr(content)
+ )
+ # No error rprint should fire (peek failure is non-fatal).
+ error_calls = [c.args[0] for c in mock_rprint.call_args_list
+ if 'Error tracking cost' in c.args[0]]
+ assert not error_calls, (
+ f"peek failure must not surface as 'Error tracking cost'; got {error_calls!r}"
+ )
+
+
+def test_peek_survives_non_utf8_csv(mock_click_context, mock_rprint, tmp_path):
+ """Regression (6th-pass review, finding 2): a cost.csv with invalid
+ UTF-8 bytes must NOT drop the new cost row. Previously the peek's
+ `encoding='utf-8'` open raised UnicodeDecodeError, which only OSError
+ caught — so the row was lost via the outer exception handler.
+ """
+ csv_path = tmp_path / "cost.csv"
+ # Header bytes followed by a non-UTF-8 sequence (lone 0x80 continuation).
+ csv_path.write_bytes(
+ b"timestamp,model,command,cost,input_files,output_files\n"
+ b"2026-01-01,gpt-4,sync,0.01,a.py,\x80\x81bad.py\n"
+ )
+
+ mock_ctx = create_mock_context(
+ 'sync',
+ {'prompt_file': '/p.txt', 'output_cost': str(csv_path), 'output': '/o.txt'},
+ obj={'output_cost': str(csv_path)}
+ )
+ mock_click_context.return_value = mock_ctx
+
+ @track_cost
+ def cmd(ctx_arg, prompt_file, output):
+ return (output, 0.5, 'm-utf8-fallback')
+
+ cmd(mock_ctx, '/p.txt', output='/o.txt')
+
+ content = csv_path.read_bytes()
+ assert b'm-utf8-fallback' in content, (
+ "non-UTF-8 CSV must not drop the cost row; final content:\n" + repr(content)
+ )
+ error_calls = [c.args[0] for c in mock_rprint.call_args_list
+ if 'Error tracking cost' in c.args[0]]
+ assert not error_calls, (
+ f"UnicodeDecodeError in peek must not surface as 'Error tracking cost'; got {error_calls!r}"
+ )
+
+
+def test_unsupported_fs_lock_attempts_unlocked_migration(mock_click_context, mock_rprint, tmp_path):
+ """Regression (6th-pass review, finding 3): when migration is needed
+ AND the filesystem doesn't support O_EXCL (unsupported lock, not
+ contended), the previous behavior was to skip the cost row
+ indefinitely until manual migration. The new policy attempts
+ unlocked migration with a documented-tradeoff warning, so
+ single-process use on such filesystems still works.
+ """
+ csv_path = tmp_path / "cost.csv"
+ # Legacy 6-column CSV.
+ csv_path.write_text(
+ "timestamp,model,command,cost,input_files,output_files\n"
+ "2026-01-01T00:00:00.000,gpt-4,sync,0.01,a.py,b.py\n",
+ encoding="utf-8",
+ )
+
+ mock_ctx = create_mock_context(
+ 'sync',
+ {'prompt_file': '/p.txt', 'output_cost': str(csv_path), 'output': '/o.txt'},
+ obj={'output_cost': str(csv_path)}
+ )
+ mock_click_context.return_value = mock_ctx
+
+ @track_cost
+ def cmd(ctx_arg, prompt_file, output):
+ return (output, 0.5, 'm-unsupported-fs')
+
+ # Simulate unsupported filesystem: acquire returns (None, False).
+ with patch('pdd.track_cost._acquire_atomic_lock', return_value=(None, False)):
+ cmd(mock_ctx, '/p.txt', output='/o.txt')
+
+ # CSV must be migrated AND the new row appended (single-process safe).
+ content = csv_path.read_text(encoding="utf-8")
+ assert 'attempted_models' in content.splitlines()[0], (
+ "header must be migrated even on unsupported FS; got:\n" + content
+ )
+ assert 'm-unsupported-fs' in content, (
+ "cost row must be appended on unsupported FS; got:\n" + content
+ )
+ warn_calls = [c.args[0] for c in mock_rprint.call_args_list]
+ assert any('lock not supported on this filesystem' in w for w in warn_calls), (
+ f"unsupported-FS tradeoff must be documented via warning; got rprint: {warn_calls!r}"
+ )