diff --git a/auto_tune_vllm/core/config.py b/auto_tune_vllm/core/config.py index 420ef42..bc6b8d5 100644 --- a/auto_tune_vllm/core/config.py +++ b/auto_tune_vllm/core/config.py @@ -40,15 +40,18 @@ class ObjectiveConfig: direction: str # "maximize" or "minimize" valid_metrics = { - "output_tokens_per_second", - "request_latency", - "time_to_first_token_ms", - "inter_token_latency_ms", - "requests_per_second", - } + "output_tokens_per_second", + "request_latency", + "time_to_first_token_ms", + "inter_token_latency_ms", + "requests_per_second", + } valid_directions = {"maximize", "minimize"} valid_percentiles = {"median", "p50", "p95", "p90", "p99", "mean"} - valid_metrics_combined = {f"{metric}_{percentile}" for metric, percentile in product(valid_metrics, valid_percentiles)} + valid_metrics_combined = { + f"{metric}_{percentile}" + for metric, percentile in product(valid_metrics, valid_percentiles) + } def _break_down_objectives(self) -> list[str]: """ @@ -66,9 +69,7 @@ def _break_down_objectives(self) -> list[str]: try: tree = ast.parse(self.metric, mode="eval") except SyntaxError as e: - raise ValueError( - f"Invalid metric expression {self.metric!r}: {e}" - ) from e + raise ValueError(f"Invalid metric expression {self.metric!r}: {e}") from e metrics: list[str] = [] seen: set[str] = set() @@ -122,26 +123,45 @@ class OptimizationConfig: approach: Optional[str] = None # "single_objective" or "multi_objective" objectives: Optional[List[ObjectiveConfig]] = None # For multi-objective preset: Optional[str] = None # "high_throughput", "low_latency", "balanced" + log_metrics: Optional[List[str]] = ( + None # Optional metrics copied to Optuna trial user attrs (dashboard) + ) def __post_init__(self): """Process and validate optimization configuration.""" - # Handle preset configurations if self.preset: self._apply_preset() - return - - # Handle new structured format - if self.approach: + elif self.approach: self._validate_structured_format() - return - - # Handle backward 
def _validate_log_metrics(self) -> None:
    """Check ``log_metrics`` and replace an unset value with an empty list.

    This check does not look at the objective configuration; it only
    inspects ``self.log_metrics``.

    Raises:
        ValueError: If ``log_metrics`` is not a list, if any entry is not a
            string, or if any entry is not a member of
            ``ObjectiveConfig.valid_metrics_combined``.
    """
    requested = self.log_metrics

    # An unset field is normalized so later code can iterate it directly.
    if requested is None:
        self.log_metrics = []
        return

    if not isinstance(requested, list):
        raise ValueError(
            "log_metrics must be a list of metric identifier strings, "
            f"got {type(requested).__name__}"
        )

    allowed = ObjectiveConfig.valid_metrics_combined
    for entry in requested:
        # Type check first: a non-string entry gets the dedicated message
        # below instead of reaching the set-membership test.
        if not isinstance(entry, str):
            raise ValueError(
                "log_metrics entries must be strings, "
                f"got {type(entry).__name__}: {entry!r}"
            )
        if entry in allowed:
            continue
        raise ValueError(
            f"Unknown metric {entry!r} in log_metrics. "
            "Each entry must be a single identifier from "
            f"{sorted(allowed)}"
        )
+ Applies to optimization trials and baseline reference trials alike. + """ + if result.trial_type not in ("optimization", "baseline"): + return + names = self.config.optimization.log_metrics + if not names or not result.success or not result.detailed_metrics: + return + for name in names: + if name not in result.detailed_metrics: + logger.warning( + "log_metrics: metric %r not found in detailed_metrics for trial %s; " + "skipping user attr", + name, + result.trial_number, + ) + continue + raw = result.detailed_metrics[name] + try: + value = float(raw) + except (TypeError, ValueError): + logger.warning( + "log_metrics: cannot coerce metric %r value %r to float for " + "trial %s; skipping user attr", + name, + raw, + result.trial_number, + ) + continue + trial.set_user_attr(f"metric_{name}", value) + def get_best_baseline_result(self) -> list[float] | None: """Get the best baseline result for comparison.""" if not self.baseline_results: diff --git a/docs/configuration.md b/docs/configuration.md index a6357f6..f94d3be 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -180,6 +180,25 @@ Number of optimization trials to run. Each trial tests one parameter combination #### `n_startup_trials` (integer, optional) Number of random trials to run before starting the main sampler algorithm. Only supported by some samplers (TPE, BoTorch). Helps initialize the sampler with diverse data points. +#### `log_metrics` (list of strings, optional) +Extra benchmark scalars to copy onto each **Optuna trial** as [user attributes](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial.set_user_attr), mainly so tools like **Optuna Dashboard** can plot or filter on them alongside objectives. + +- **Semantics**: This does **not** change the optimization objective. It only stores additional numbers on the trial record after a successful benchmark. 
+- **Identifiers**: Each list entry must be a single metric id in the same `<metric>_<percentile>` form as in objective expressions (see **`objectives`** above), e.g. `request_latency_p95`, `output_tokens_per_second_median`. Allowed names are exactly the combined identifiers derived from the base metrics and percentiles documented for objectives. +- **Storage**: For each configured name, the runner writes `trial.set_user_attr("metric_<name>", float_value)` using the value from the trial’s `detailed_metrics`. If a name is missing from `detailed_metrics`, or the value cannot be converted to a float, a warning is logged and that attribute is skipped. +- **Trials**: Applied to **optimization** and **baseline** trials when the run succeeds and detailed metrics are present. Omitted or unset `log_metrics` is treated as an empty list. + +Example: + +```yaml +optimization: + preset: "balanced" + n_trials: 50 + log_metrics: + - "inter_token_latency_ms_p95" + - "time_to_first_token_ms_median" +``` + ### Preset Configurations Explained #### High Throughput Preset diff --git a/examples/study_config.yaml b/examples/study_config.yaml index fe7b514..7b73d60 100644 --- a/examples/study_config.yaml +++ b/examples/study_config.yaml @@ -12,6 +12,12 @@ optimization: direction: "maximize" - metric: "time_to_first_token_ms_p95" # Worst-case TTFT direction: "minimize" + # Optional: copy extra benchmark scalars to Optuna user attrs (metric_<name>) + # for the dashboard; entries must be single combined metric keys matching + # ObjectiveConfig metric identifiers (for example, "inter_token_latency_ms_p95"), + # not arithmetic expressions. 
"""Tests for how OptimizationConfig validates its ``log_metrics`` field."""

from __future__ import annotations

import pytest

from auto_tune_vllm.core.config import ObjectiveConfig, OptimizationConfig


def _single_objective_config(**extra) -> OptimizationConfig:
    """Build a single-objective config that maximizes mean output throughput.

    Keyword arguments are forwarded unchanged to ``OptimizationConfig`` so a
    test can supply (or omit) ``log_metrics``.
    """
    throughput = ObjectiveConfig(
        metric="output_tokens_per_second_mean",
        direction="maximize",
    )
    return OptimizationConfig(
        approach="single_objective",
        objectives=[throughput],
        **extra,
    )


def test_log_metrics_default_normalized_to_empty_list():
    """Leaving ``log_metrics`` unset results in an empty list."""
    assert _single_objective_config().log_metrics == []


def test_log_metrics_valid_entries():
    """Known metric identifiers are accepted and kept in order."""
    expected = ("time_to_first_token_ms_p95", "request_latency_median")
    cfg = _single_objective_config(log_metrics=list(expected))
    # Compare against a fresh list so the check does not depend on identity.
    assert cfg.log_metrics == list(expected)


def test_log_metrics_invalid_metric_raises():
    """An identifier outside the allowed set is rejected."""
    with pytest.raises(ValueError, match="Unknown metric"):
        _single_objective_config(log_metrics=["not_a_valid_metric_p95"])


def test_log_metrics_wrong_container_type_raises():
    """A bare string instead of a list is rejected."""
    with pytest.raises(ValueError, match="log_metrics must be a list"):
        _single_objective_config(
            log_metrics="time_to_first_token_ms_p95",  # type: ignore[arg-type]
        )