diff --git a/.github/workflows/run-eval.yml b/.github/workflows/run-eval.yml index 5757a69cc..c2670915f 100644 --- a/.github/workflows/run-eval.yml +++ b/.github/workflows/run-eval.yml @@ -14,6 +14,7 @@ on: - swtbench - commit0 - swebenchmultimodal + - terminalbench sdk_ref: description: SDK commit/ref to evaluate required: true diff --git a/AGENTS.md b/AGENTS.md index 0206a51d1..7358fba15 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -100,4 +100,10 @@ When converting between OpenHands format and benchmark-specific formats: - Handle missing/optional fields gracefully - Log conversion warnings for debugging - Validate output format before evaluation + +# Terminal-Bench Notes +- Harbor's installable package is `harbor` (not `harbor-bench`). +- The Harbor dataset name used in CI is `terminal-bench@2.0`. +- For CI smoke tests, pass `--n-limit <N>` to `terminalbench-infer` so Harbor only runs the requested subset. + diff --git a/benchmarks/terminalbench/README.md b/benchmarks/terminalbench/README.md index 7b5abba36..bd6026ca6 100644 --- a/benchmarks/terminalbench/README.md +++ b/benchmarks/terminalbench/README.md @@ -15,9 +15,9 @@ Terminal-Bench evaluates how well AI agents can handle real-world, end-to-end ta 1. **Install Harbor**: Harbor is the official harness for running Terminal-Bench 2.0. ```bash -pip install harbor-bench +pip install harbor # or -uv pip install harbor-bench +uv pip install harbor ``` 2. **Docker**: Harbor requires Docker to be installed and running. 
@@ -43,6 +43,9 @@ uv run terminalbench-infer .llm_config/claude.json --select tasks.txt # Run with specific dataset version uv run terminalbench-infer .llm_config/claude.json --dataset terminal-bench@2.0 +# Limit the run to 5 tasks (useful for CI smoke tests) +uv run terminalbench-infer .llm_config/claude.json --n-limit 5 + # Run with multiple workers uv run terminalbench-infer .llm_config/claude.json --num-workers 4 ``` diff --git a/benchmarks/terminalbench/config.py b/benchmarks/terminalbench/config.py index a6fb2d31c..35748c0f4 100644 --- a/benchmarks/terminalbench/config.py +++ b/benchmarks/terminalbench/config.py @@ -2,7 +2,7 @@ # Default inference settings (only include values actually used by argparse) INFER_DEFAULTS = { - "dataset": "terminal-bench-2", + "dataset": "terminal-bench@2.0", "output_dir": "./evaluation_outputs", "num_workers": 1, } diff --git a/benchmarks/terminalbench/run_infer.py b/benchmarks/terminalbench/run_infer.py index d222d35b2..a01c4006d 100644 --- a/benchmarks/terminalbench/run_infer.py +++ b/benchmarks/terminalbench/run_infer.py @@ -5,7 +5,7 @@ with the standard evaluation pipeline. Usage: - uv run terminalbench-infer --dataset terminal-bench@head + uv run terminalbench-infer --dataset terminal-bench@2.0 """ import argparse @@ -51,15 +51,17 @@ def run_harbor_evaluation( output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, + n_limit: int | None = None, ) -> Path: """Run harbor evaluation with openhands-sdk agent. Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., terminal-bench@head). + dataset: Harbor dataset name (e.g., terminal-bench@2.0). output_dir: Directory to store output files. num_workers: Number of parallel workers. task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. Returns: Path to the harbor output directory. 
@@ -101,6 +103,9 @@ def run_harbor_evaluation( for task_id in task_ids: cmd.extend(["--task-name", task_id]) + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + logger.info(f"Running harbor command: {' '.join(cmd)}") logger.info(f"Output directory: {harbor_output_dir}") @@ -122,7 +127,7 @@ def run_harbor_evaluation( except FileNotFoundError: raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor-bench" + "Harbor CLI not found. Please install harbor: pip install harbor" ) return harbor_output_dir @@ -246,8 +251,11 @@ def convert_harbor_to_eval_output( if not results and not errors: raise RuntimeError(f"No trials processed from {harbor_output_dir}") - if not results and errors: - raise RuntimeError(f"All {len(errors)} trials failed from {harbor_output_dir}") + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) # Write results to output.jsonl with open(eval_output_path, "w") as f: @@ -300,7 +308,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., terminal-bench@head, terminal-bench@2.0)", + help="Harbor dataset name (e.g., terminal-bench@2.0)", ) parser.add_argument( "--output-dir", @@ -314,6 +322,11 @@ def main() -> None: default=INFER_DEFAULTS["num_workers"], help="Number of parallel workers", ) + parser.add_argument( + "--n-limit", + type=int, + help="Maximum number of dataset tasks to run after Harbor filtering", + ) parser.add_argument( "--select", type=str, @@ -352,9 +365,9 @@ def main() -> None: if not args.skip_harbor and not check_harbor_installed(): logger.error( "Harbor CLI is not installed. 
Please install it:\n" - " pip install harbor-bench\n" + " pip install harbor\n" " # or\n" - " uv pip install harbor-bench" + " uv pip install harbor" ) sys.exit(1) @@ -404,6 +417,7 @@ def main() -> None: output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, + n_limit=args.n_limit, ) # Convert harbor output to standard format diff --git a/tests/test_terminalbench.py b/tests/test_terminalbench.py index 61ef31ed5..39a8d6ffa 100644 --- a/tests/test_terminalbench.py +++ b/tests/test_terminalbench.py @@ -5,8 +5,13 @@ import pytest +from benchmarks.terminalbench.config import INFER_DEFAULTS from benchmarks.terminalbench.eval_infer import process_terminalbench_results -from benchmarks.terminalbench.run_infer import convert_harbor_to_eval_output +from benchmarks.terminalbench.run_infer import ( + convert_harbor_to_eval_output, + run_harbor_evaluation, +) +from openhands.sdk import LLM class TestProcessTerminalbenchResults: @@ -206,6 +211,69 @@ def test_report_file_written(self, tmp_path: Path) -> None: assert "resolved_ids" in report +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" + + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" + assert INFER_DEFAULTS["dataset"] == "terminal-bench@2.0" + + def test_run_harbor_evaluation_passes_filters_and_limits( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor command includes task filters and n-limit for CI runs.""" + captured: dict[str, list[str]] = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool): + captured["cmd"] = cmd + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr( + "benchmarks.terminalbench.run_infer.subprocess.run", fake_run + ) + + harbor_output_dir = run_harbor_evaluation( + llm=LLM( + model="litellm_proxy/test-model", + 
api_key="test-key", + base_url="https://proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=3, + task_ids=["task-a", "task-b"], + n_limit=5, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "terminal-bench@2.0", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--task-name") == 2 + assert "task-a" in cmd + assert "task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "3" + assert cmd[cmd.index("--n-tasks") + 1] == "5" + assert "LLM_API_KEY=test-key" in cmd + assert "LLM_BASE_URL=https://proxy.example.com" in cmd + + class TestConvertHarborToEvalOutput: """Tests for convert_harbor_to_eval_output function.""" @@ -292,7 +360,7 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["metrics"]["total_cost_usd"] == 0.0 def test_trial_with_exception(self, tmp_path: Path) -> None: - """Test handling of a trial with exception.""" + """Test exception-only Harbor output is preserved for downstream reporting.""" trial_result = { "task_name": "error-task", "trial_name": "error-task__err", @@ -305,10 +373,26 @@ def test_trial_with_exception(self, tmp_path: Path) -> None: tmp_path, [("error-task__err", trial_result)] ) output_file = tmp_path / "output.jsonl" + report_file = tmp_path / "report.json" - # Should raise since all trials have exceptions and none succeeded - with pytest.raises(RuntimeError, match="All .* trials failed"): - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries == [ + { + "instance_id": "error-task", + "error": "{'type': 'TimeoutError', 'message': 'Agent timed out'}", + 
"test_result": {}, + } + ] + + report = process_terminalbench_results(str(output_file), str(report_file)) + assert report["total_instances"] == 1 + assert report["completed_instances"] == 0 + assert report["error_instances"] == 1 + assert report["incomplete_ids"] == ["error-task"] def test_mixed_valid_and_exception_trials(self, tmp_path: Path) -> None: """Test handling mix of successful and exception trials."""