From b0ccae6915f27183efae8773b21e5cf557df48d1 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 28 Apr 2026 16:50:24 +0200 Subject: [PATCH 1/5] add guidellm preflight check and pin vllm to <=0.19 Signed-off-by: Vincent Gimenes --- auto_tune_vllm/benchmarks/providers.py | 31 +++++++++++++++++++++++--- pyproject.toml | 2 +- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/auto_tune_vllm/benchmarks/providers.py b/auto_tune_vllm/benchmarks/providers.py index e426bbc..ae02349 100644 --- a/auto_tune_vllm/benchmarks/providers.py +++ b/auto_tune_vllm/benchmarks/providers.py @@ -210,6 +210,10 @@ def start_benchmark( if not (model_url.startswith("http://") or model_url.startswith("https://")): raise ValueError(f"Invalid model_url: {model_url!r} (expected http/https)") + env = os.environ.copy() + env["GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL"] = config.logging_level + self._validate_guidellm_cli(env) + # Run GuideLLM self._logger.info(f"Running: {' '.join(cmd)}") self._logger.info(f"Results will be saved to: {self._results_file}") @@ -217,9 +221,6 @@ def start_benchmark( # Use Popen so we can terminate if vLLM dies # start_new_session=True puts it in its own process group for clean # termination - env = os.environ.copy() - env["GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL"] = config.logging_level - self._process = subprocess.Popen( cmd, stdout=subprocess.PIPE, @@ -245,6 +246,30 @@ def start_benchmark( return self._process + def _validate_guidellm_cli(self, env: dict[str, str]) -> None: + """Fail fast if GuideLLM cannot start due to dependency/import issues.""" + try: + result = subprocess.run( + ["guidellm", "benchmark", "--help"], + capture_output=True, + text=True, + timeout=15, + env=env, + ) + except subprocess.TimeoutExpired as exc: + raise RuntimeError( + "GuideLLM CLI validation timed out while running " + "'guidellm benchmark --help'." + ) from exc + + if result.returncode != 0: + error_output = (result.stderr or result.stdout or "").strip() + raise RuntimeError( + "GuideLLM CLI validation failed while running " + f"'guidellm benchmark --help' (exit code {result.returncode}). " + f"Output: {error_output}" + ) + def parse_results(self) -> Dict[str, Any]: """ Parse GuideLLM benchmark results from output file. diff --git a/pyproject.toml b/pyproject.toml index 4c1f04a..37ab35e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ requires-python = ">=3.10" dependencies = [ "optuna>=3.0.0", "optuna-integration[botorch]>=4.0.0", - "vllm>=0.11.0", + "vllm>=0.11.0,<=0.19", "guidellm>=0.1.0", "pyyaml>=6.0", "pydantic>=2.0.0", From aaf573fe2fd48c51825efb323d02cda0bd1f38ec Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Mon, 4 May 2026 11:11:29 +0200 Subject: [PATCH 2/5] increase timeout Signed-off-by: Vincent Gimenes --- auto_tune_vllm/benchmarks/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_tune_vllm/benchmarks/providers.py b/auto_tune_vllm/benchmarks/providers.py index ae02349..64570bf 100644 --- a/auto_tune_vllm/benchmarks/providers.py +++ b/auto_tune_vllm/benchmarks/providers.py @@ -253,7 +253,7 @@ def _validate_guidellm_cli(self, env: dict[str, str]) -> None: ["guidellm", "benchmark", "--help"], capture_output=True, text=True, - timeout=15, + timeout=30, env=env, ) except subprocess.TimeoutExpired as exc: From 30d59330a28432913d13a3077abd8d37e480c2bd Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 12 May 2026 15:54:01 +0200 Subject: [PATCH 3/5] fix: fail fast on GuideLLM benchmark CLI preflight before vLLM startup --- auto_tune_vllm/benchmarks/providers.py | 28 +++++++++++++------- auto_tune_vllm/execution/trial_controller.py | 5 ++++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/auto_tune_vllm/benchmarks/providers.py b/auto_tune_vllm/benchmarks/providers.py index 64570bf..000c19e 100644 --- a/auto_tune_vllm/benchmarks/providers.py +++ b/auto_tune_vllm/benchmarks/providers.py @@ -134,6 +134,10 @@ def terminate_benchmark(self): self._process_pid = None self._process_pgid = None + def validate_preflight(self, config: BenchmarkConfig) -> None: + """Validate benchmark toolchain before starting any server processes.""" + return + @abstractmethod def start_benchmark( self, model_url: str, config: BenchmarkConfig @@ -199,20 +203,12 @@ def start_benchmark( # Build GuideLLM command cmd = self._build_guidellm_command(model_url, config, self._results_file) - # Validate binary and basic inputs - import shutil - - if shutil.which("guidellm") is None: - raise RuntimeError( - "GuideLLM CLI not found on PATH. " - "Please install or provide the full path." - ) + # Validate benchmark target inputs if not (model_url.startswith("http://") or model_url.startswith("https://")): raise ValueError(f"Invalid model_url: {model_url!r} (expected http/https)") env = os.environ.copy() env["GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL"] = config.logging_level - self._validate_guidellm_cli(env) # Run GuideLLM self._logger.info(f"Running: {' '.join(cmd)}") @@ -246,6 +242,20 @@ def start_benchmark( return self._process + def validate_preflight(self, config: BenchmarkConfig) -> None: + """Validate GuideLLM CLI before launching vLLM or benchmark.""" + import shutil + + if shutil.which("guidellm") is None: + raise RuntimeError( + "GuideLLM CLI not found on PATH. " + "Please install or provide the full path." + ) + + env = os.environ.copy() + env["GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL"] = config.logging_level + self._validate_guidellm_cli(env) + def _validate_guidellm_cli(self, env: dict[str, str]) -> None: """Fail fast if GuideLLM cannot start due to dependency/import issues.""" try: diff --git a/auto_tune_vllm/execution/trial_controller.py b/auto_tune_vllm/execution/trial_controller.py index cca009f..626ec3c 100644 --- a/auto_tune_vllm/execution/trial_controller.py +++ b/auto_tune_vllm/execution/trial_controller.py @@ -337,6 +337,11 @@ def run_trial( # Setup benchmark provider self.benchmark_provider = self._create_benchmark_provider(trial_config) + controller_logger.info( + "Validating benchmark toolchain before starting vLLM server" + ) + self.benchmark_provider.validate_preflight(trial_config.benchmark_config) + controller_logger.info("Benchmark toolchain validation passed") # Setup cancellation checker function def should_cancel(): From c1568b16cc790ee1ce4dd3d70837bda98d355cd1 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 13 May 2026 16:30:50 +0200 Subject: [PATCH 4/5] apply the check before everything --- auto_tune_vllm/benchmarks/providers.py | 42 -------------------- auto_tune_vllm/cli/main.py | 39 ++++++++++++++++++ auto_tune_vllm/execution/trial_controller.py | 5 --- 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/auto_tune_vllm/benchmarks/providers.py b/auto_tune_vllm/benchmarks/providers.py index 000c19e..75437b4 100644 --- a/auto_tune_vllm/benchmarks/providers.py +++ b/auto_tune_vllm/benchmarks/providers.py @@ -134,10 +134,6 @@ def terminate_benchmark(self): self._process_pid = None self._process_pgid = None - def validate_preflight(self, config: BenchmarkConfig) -> None: - """Validate benchmark toolchain before starting any server processes.""" - return - @abstractmethod def start_benchmark( self, model_url: str, config: BenchmarkConfig @@ -242,44 +238,6 @@ def start_benchmark( return self._process - def validate_preflight(self, config: BenchmarkConfig) -> None: - """Validate GuideLLM CLI before launching vLLM or benchmark.""" - import shutil - - if shutil.which("guidellm") is None: - raise RuntimeError( - "GuideLLM CLI not found on PATH. " - "Please install or provide the full path." - ) - - env = os.environ.copy() - env["GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL"] = config.logging_level - self._validate_guidellm_cli(env) - - def _validate_guidellm_cli(self, env: dict[str, str]) -> None: - """Fail fast if GuideLLM cannot start due to dependency/import issues.""" - try: - result = subprocess.run( - ["guidellm", "benchmark", "--help"], - capture_output=True, - text=True, - timeout=30, - env=env, - ) - except subprocess.TimeoutExpired as exc: - raise RuntimeError( - "GuideLLM CLI validation timed out while running " - "'guidellm benchmark --help'." - ) from exc - - if result.returncode != 0: - error_output = (result.stderr or result.stdout or "").strip() - raise RuntimeError( - "GuideLLM CLI validation failed while running " - f"'guidellm benchmark --help' (exit code {result.returncode}). " - f"Output: {error_output}" - ) - def parse_results(self) -> Dict[str, Any]: """ Parse GuideLLM benchmark results from output file. diff --git a/auto_tune_vllm/cli/main.py b/auto_tune_vllm/cli/main.py index 96f669b..bee3733 100644 --- a/auto_tune_vllm/cli/main.py +++ b/auto_tune_vllm/cli/main.py @@ -1,7 +1,9 @@ """Command-line interface for auto-tune-vllm.""" import logging +import os import shutil +import subprocess import sys from pathlib import Path from typing import Optional @@ -109,6 +111,41 @@ def _display_log_viewing_instructions(config: StudyConfig): ) +def _run_guidellm_benchmark_help_preflight_for_optimize(config: StudyConfig) -> None: + """Once per `optimize` run: verify `guidellm benchmark --help` succeeds (GuideLLM only).""" + if config.benchmark.benchmark_type != "guidellm": + return + + console.print("[blue]Checking GuideLLM CLI (guidellm benchmark --help)...[/blue]") + if shutil.which("guidellm") is None: + raise RuntimeError( + "GuideLLM CLI not found on PATH. " + "Install GuideLLM or ensure `guidellm` is available." + ) + env = os.environ.copy() + env["GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL"] = config.benchmark.logging_level + try: + result = subprocess.run( + ["guidellm", "benchmark", "--help"], + capture_output=True, + text=True, + timeout=30, + env=env, + ) + except subprocess.TimeoutExpired as exc: + raise RuntimeError( + "GuideLLM CLI check timed out while running 'guidellm benchmark --help'." + ) from exc + + if result.returncode != 0: + err = (result.stderr or result.stdout or "").strip() + raise RuntimeError( + f"GuideLLM CLI check failed (exit code {result.returncode}) " + f"for 'guidellm benchmark --help'. Output: {err}" + ) + console.print("[green]✓ GuideLLM CLI OK[/green]") + + @app.command("optimize") def optimize_command( config: str = typer.Option(..., "--config", "-c", help="Study configuration file"), @@ -405,6 +442,8 @@ def run_optimization_sync( "so startup sampling would consume the full trial budget. " f"n_startup_trials is now {config.optimization.n_startup_trials}.[/yellow]" ) + _run_guidellm_benchmark_help_preflight_for_optimize(config) + # Create study controller (uses config with possibly updated sampler/n_trials) controller = StudyController.create_from_config( backend, config, create_db=create_db diff --git a/auto_tune_vllm/execution/trial_controller.py b/auto_tune_vllm/execution/trial_controller.py index 626ec3c..cca009f 100644 --- a/auto_tune_vllm/execution/trial_controller.py +++ b/auto_tune_vllm/execution/trial_controller.py @@ -337,11 +337,6 @@ def run_trial( # Setup benchmark provider self.benchmark_provider = self._create_benchmark_provider(trial_config) - controller_logger.info( - "Validating benchmark toolchain before starting vLLM server" - ) - self.benchmark_provider.validate_preflight(trial_config.benchmark_config) - controller_logger.info("Benchmark toolchain validation passed") # Setup cancellation checker function def should_cancel(): From 472492bf1669e5c337b0954ff83961b3d79a835e Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Fri, 15 May 2026 11:54:24 +0200 Subject: [PATCH 5/5] increase timeout --- auto_tune_vllm/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_tune_vllm/cli/main.py b/auto_tune_vllm/cli/main.py index bee3733..7e211f0 100644 --- a/auto_tune_vllm/cli/main.py +++ b/auto_tune_vllm/cli/main.py @@ -129,7 +129,7 @@ def _run_guidellm_benchmark_help_preflight_for_optimize(config: StudyConfig) -> ["guidellm", "benchmark", "--help"], capture_output=True, text=True, - timeout=30, + timeout=120, env=env, ) except subprocess.TimeoutExpired as exc: