#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
"""Benchmark `.mxr` files dumped by ``MIGRAPHX_GPU_DUMP_BENCHMARK_MXR``.

For each problem encountered (one or more competing solutions saved as separate
``.mxr`` files), each solution is timed using the same recipe hipBLASLt's
exhaustive tune uses (``time_loop`` in ``src/targets/gpu/time_op.cpp``):

  1. one untimed warmup call,
  2. ``nruns`` iterations of ``bundle`` consecutive ``program.run`` calls,
  3. sort the per-call samples and average the middle 50% (drop top and
     bottom 25%) to produce the "common average" used elsewhere in MIGraphX.

Defaults match the hipBLASLt path (``bundle=4``, ``nruns=40``). The
``MIGRAPHX_BENCHMARKING_BUNDLE`` and ``MIGRAPHX_BENCHMARKING_NRUNS`` env vars
are honored, the same names the C++ runtime checks.

The fastest solution per problem is written to a JSON file in the same shape
``MIGRAPHX_PROBLEM_CACHE`` reads (an array of ``[key, value]`` pairs where
each key is ``{"name": <preop>, "problem": <problem>}`` and the value is the
solution).
"""

import argparse
import json
import os
import re
import sys
import time
from collections import OrderedDict

import migraphx

# ---------------------------------------------------------------------------
# Parser for the `migraphx::to_string(value)` format produced by
# `compile_plan::save_binaries`. The text inside the `@comment` op is shaped
# like:
#
#   "<preop_name> problem={...} solution={...}"
#
# The `{...}` is `migraphx::to_string` of a `value` (see `print_value` in
# `src/value.cpp`). That format renders BOTH arrays and objects with `{}`,
# numeric/identifier atoms as bare tokens, and key/value pairs separated by
# `:`. Strings inside values (e.g. "float_type") are emitted unquoted.
#
# Object vs array is disambiguated by looking at the first element inside a
# `{...}` block: if it is followed by `:`, the block is an object, otherwise
# an array.
# ---------------------------------------------------------------------------

# One alternative per token kind, tried in order at the current position:
# whitespace, double-quoted string (with backslash escapes), nan/inf spellings,
# numbers, identifiers (dots allowed after the first character), `::`, and
# single-character punctuation.
# NOTE(review): `::` is tokenized, but no parser rule below consumes it, so a
# value such as `gpu::foo` inside a `{...}` block currently fails to parse --
# confirm whether such values can occur.
_TOKEN_RE = re.compile(
    r"\s+"
    r'|"(?:\\.|[^"\\])*"'
    r"|-?(?:nan|inf|infinity)\b"
    r"|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?"
    r"|-?\.\d+(?:[eE][+\-]?\d+)?"
    r"|[A-Za-z_][A-Za-z0-9_.]*"
    r"|::"
    r"|[{}\[\]:,]")

# Spellings that map to fixed Python values. Looked up with the lower-cased
# token, so matching is case-insensitive.
_LITERAL_KEYWORDS = {
    "null": None,
    "true": True,
    "false": False,
    "nan": float("nan"),
    "inf": float("inf"),
    "infinity": float("inf"),
    "-nan": float("nan"),
    "-inf": float("-inf"),
    "-infinity": float("-inf"),
}


def _tokenize(text):
    """Split ``text`` into the list of non-whitespace tokens.

    Raises:
        ValueError: if no ``_TOKEN_RE`` alternative matches at some position.
    """
    tokens = []
    pos = 0
    end = len(text)
    while pos < end:
        match = _TOKEN_RE.match(text, pos)
        if not match:
            raise ValueError(
                f"Could not tokenize at position {pos}: {text[pos:pos+20]!r}")
        tok = match.group(0)
        # Whitespace runs are matched only so they can be skipped.
        if not tok.isspace():
            tokens.append(tok)
        pos = match.end()
    return tokens


def _atom_to_python(tok):
    """Convert a single token (or bare atom string) to a Python value.

    Conversion order: double-quoted string (quotes stripped, backslash
    escapes decoded), literal keyword (``null``/``true``/``false``/``nan``/
    ``inf`` spellings, case-insensitive), ``int``, ``float``; anything else
    is returned unchanged as a string.
    """
    # Require two distinct quote characters so a lone '"' stays a plain atom.
    if len(tok) >= 2 and tok[0] == '"' and tok[-1] == '"':
        # `unicode_escape` decodes bytes as Latin-1, so feeding it UTF-8 bytes
        # would mangle non-ASCII text. Encode as Latin-1 instead and let
        # `backslashreplace` turn anything outside Latin-1 into `\uXXXX` /
        # `\UXXXXXXXX` escapes, which the decoder then restores.
        raw = tok[1:-1].encode("latin-1", "backslashreplace")
        return raw.decode("unicode_escape")
    keyword = tok.lower()
    if keyword in _LITERAL_KEYWORDS:
        return _LITERAL_KEYWORDS[keyword]
    for convert in (int, float):
        try:
            return convert(tok)
        except ValueError:
            continue
    return tok


class _TokenStream:
    """Forward-only cursor over a token list with one-token lookahead."""

    def __init__(self, tokens):
        self._tokens = tokens
        self._idx = 0

    def peek(self):
        """Return the next token without consuming it, or ``None`` at end."""
        if self._idx < len(self._tokens):
            return self._tokens[self._idx]
        return None

    def take(self):
        """Consume and return the next token (``None`` at end of input)."""
        tok = self.peek()
        self._idx += 1
        return tok

    def expect(self, expected):
        """Consume the next token and raise ``ValueError`` unless it equals
        ``expected``."""
        tok = self.take()
        if tok != expected:
            raise ValueError(f"Expected {expected!r}, got {tok!r}")


def _parse_value(stream):
    """Parse one value: a ``{...}`` block, a ``[...]`` block, or an atom.

    Raises:
        ValueError: if the stream is already exhausted.
    """
    tok = stream.peek()
    if tok is None:
        raise ValueError("Unexpected end of input")
    if tok == "{":
        return _parse_brace(stream)
    if tok == "[":
        return _parse_bracket(stream)
    return _atom_to_python(stream.take())


def _parse_brace(stream):
    """Parse a ``{...}`` block as either an object or an array.

    The block is an object (returned as an ``OrderedDict`` whose keys are
    coerced with ``str``) when its first element is followed by ``:``;
    otherwise it is an array (returned as a ``list``). An empty ``{}`` is
    returned as an empty list.
    """
    stream.expect("{")
    if stream.peek() == "}":
        stream.take()
        return []
    first = _parse_value(stream)
    if stream.peek() == ":":
        # Object: `first` was the first key.
        stream.take()
        result = OrderedDict()
        result[str(first)] = _parse_value(stream)
        while stream.peek() == ",":
            stream.take()
            key = _parse_value(stream)
            stream.expect(":")
            result[str(key)] = _parse_value(stream)
        stream.expect("}")
        return result
    # Array: `first` was the first element.
    items = [first]
    while stream.peek() == ",":
        stream.take()
        items.append(_parse_value(stream))
    stream.expect("}")
    return items


def _parse_bracket(stream):
    """Parse a ``[...]`` block into a ``list`` of comma-separated values."""
    stream.expect("[")
    if stream.peek() == "]":
        stream.take()
        return []
    items = [_parse_value(stream)]
    while stream.peek() == ",":
        stream.take()
        items.append(_parse_value(stream))
    stream.expect("]")
    return items


def parse_value_string(text):
    """Parse a ``migraphx::to_string(value)`` rendering into Python objects.

    Two flavors are supported:

    * Structured values that begin with ``{`` or ``[`` are tokenized and
      parsed as objects/arrays of identifiers, numbers, literals (``null``,
      ``true``, ``false``, ``nan``, ``inf``), and quoted strings.
    * Bare atoms, including arbitrary strings containing whitespace or
      punctuation that the structural tokenizer does not handle, are
      converted with the atom rules only (quoted string, literal keyword,
      int, float) and otherwise returned as-is. ``gpu::mlir_op`` uses a raw
      rocMLIR tuning key
      (e.g. ``arch+:sramecc+:xnack-\\t304\\t8\\tconv\\t...``) for both
      problem and solution; see ``get_tuning_config`` in
      ``src/targets/gpu/mlir.cpp``.

    Raises:
        ValueError: if ``text`` is empty/whitespace-only, cannot be
            tokenized, is structurally malformed, or has trailing tokens
            after the top-level value.
    """
    stripped = text.strip()
    if not stripped:
        raise ValueError("Empty value string")
    if stripped[0] not in "{[":
        return _atom_to_python(stripped)
    stream = _TokenStream(_tokenize(stripped))
    result = _parse_value(stream)
    leftover = stream.peek()
    if leftover is not None:
        raise ValueError(f"Unexpected trailing token: {leftover!r}")
    return result


# ---------------------------------------------------------------------------
# Comment text splitter.
#
# Format (see `compile_ops.cpp::compile_plan::save_binaries`):
#   "<preop_name> problem={...} solution={...}"
#
# The text is split at the FIRST ` problem=` marker and the LAST ` solution=`
# marker (no brace matching is performed), so a ` solution=` occurring inside
# the problem value doesn't confuse the boundary between problem and solution.
# ---------------------------------------------------------------------------


def parse_comment_text(text):
    """Return ``(preop_name, problem_obj, solution_obj)`` parsed from the text
    of an ``@comment`` instruction inserted by ``save_binaries``.

    The text is shaped as ``"<preop_name> problem=<value> solution=<value>"``,
    where each ``<value>`` is the output of ``migraphx::to_string`` and may
    be either a ``{...}`` block or a bare atom (e.g. an int solution index).

    The problem starts after the first `` problem=`` marker and the solution
    after the last `` solution=`` marker; both are handed to
    ``parse_value_string``.

    Raises:
        ValueError: if either marker is missing, the solution marker comes
            before the problem marker, or either value is empty.
    """
    problem_marker = " problem="
    solution_marker = " solution="
    p_idx = text.find(problem_marker)
    s_idx = text.rfind(solution_marker)
    if p_idx < 0 or s_idx < 0 or s_idx < p_idx:
        raise ValueError(
            f"Comment text does not match the expected format: {text!r}")
    preop_name = text[:p_idx].strip()
    problem_str = text[p_idx + len(problem_marker):s_idx].strip()
    solution_str = text[s_idx + len(solution_marker):].strip()
    if not problem_str or not solution_str:
        raise ValueError(
            f"Empty problem or solution in comment text: {text!r}")
    return (preop_name, parse_value_string(problem_str),
            parse_value_string(solution_str))


# ---------------------------------------------------------------------------
# Metadata extraction from a loaded program.
# ---------------------------------------------------------------------------


def extract_comment_metadata(prog):
    """Walk the program's main module and return the parsed metadata of the
    ``@comment`` instruction that ``save_binaries`` inserts.

    Every ``@comment`` instruction is tried in order and the first one whose
    text parses is returned, so an unrelated comment ahead of the
    ``save_binaries`` one no longer aborts the extraction.

    Returns:
        The ``(preop_name, problem_obj, solution_obj)`` tuple produced by
        ``parse_comment_text``.

    Raises:
        ValueError: if the module has no ``@comment`` instruction, or if none
            of them parses (the error of the first one is re-raised).
    """
    first_error = None
    for ins in prog.get_main_module():
        op = ins.op()
        if op.name() != "@comment":
            continue
        # A missing "text" entry is treated as empty text, which
        # parse_comment_text rejects with a format error.
        text = op.values().get("text", "")
        try:
            return parse_comment_text(text)
        except ValueError as exc:
            if first_error is None:
                first_error = exc
    if first_error is not None:
        raise first_error
    raise ValueError("No @comment instruction found in program")


# ---------------------------------------------------------------------------
# Benchmarking loop.
# Mirrors `time_loop` in
# `src/targets/gpu/time_op.cpp` (the routine called by hipBLASLt's
# `tune()` in `src/targets/gpu/hip_gemm_impl.cpp`):
#
#   - 1 untimed warmup call
#   - nruns iterations, each timing a bundle of `bundle` consecutive runs
#   - sort the nruns samples, drop top 25% and bottom 25%, average the rest
#
# The Python bindings don't expose HIP events, so we use `time.perf_counter_ns`
# around each bundle and sync the GPU at the end of every bundle so the timer
# captures the full GPU work for that bundle.
# ---------------------------------------------------------------------------


def _attach_gpu_context(prog):
    """Attach a GPU target/context to a program loaded from an `.mxr` file.

    Programs saved by ``compile_plan::save_binaries`` are produced by
    ``compile_plan::make_program()`` (see ``src/targets/gpu/compile_ops.cpp``),
    which builds a fresh ``program`` with the compiled GPU code objects but
    without ever assigning a target. Their ``targets`` / ``contexts`` arrays
    are empty when serialized, so ``program::from_value`` skips its
    auto-finalize branch and ``prog.run(...)`` immediately fails with
    ``No context available for gpu::code_object``.

    We use ``program.finalize(target)`` -- which mirrors what the C++
    ``time_program`` (src/targets/gpu/time_op.cpp) does internally -- to
    attach the target + context and call ``module::finalize`` without
    running any compile passes. Calling ``program.compile(target)`` instead
    would re-run the full GPU pass list including ``auto_contiguous``,
    which destructively rewrites broadcast strides and corrupts the saved
    ``gpu::code_object_op``'s ``expected_inputs``.
    """
    # Programs that already report themselves as compiled are left untouched.
    if not prog.is_compiled():
        prog.finalize(migraphx.get_target("gpu"))


def _build_param_map(prog, seed):
    """Return ``{parameter name: GPU argument}`` for every program parameter.

    Each parameter is filled by ``migraphx.generate_argument`` and moved to
    the device with ``migraphx.to_gpu``. The first parameter uses ``seed`` and
    each following one uses the previous seed plus one.
    """
    params = {}
    shapes = prog.get_parameter_shapes()
    for offset, (name, shape) in enumerate(shapes.items()):
        host_arg = migraphx.generate_argument(shape, seed + offset)
        params[name] = migraphx.to_gpu(host_arg)
    return params


def _trimmed_mean_ms(samples):
    """Return the mean of ``samples`` after dropping the lowest and highest
    quarter (``len(samples) // 4`` values from each end of the sorted list).

    Raises:
        ValueError: if ``samples`` is empty (previously this surfaced as a
            ``ZeroDivisionError``).
    """
    ordered = sorted(samples)
    if not ordered:
        raise ValueError("Cannot average an empty list of samples")
    drop = len(ordered) // 4
    # For any non-empty input at least half of the samples survive the trim.
    kept = ordered[drop:len(ordered) - drop]
    return sum(kept) / len(kept)


def benchmark_program(prog, bundle, nruns, seed=0):
    """Return the trimmed-mean ms-per-call for ``prog``.

    Args:
        prog: program loaded from an ``.mxr`` file.
        bundle: number of consecutive ``prog.run`` calls per timing sample.
        nruns: number of timing samples to collect.
        seed: base seed for the generated parameter data.

    Raises:
        ValueError: if ``bundle`` or ``nruns`` is smaller than 1.
    """
    # Validate before touching the GPU; bundle == 0 would otherwise only fail
    # with a ZeroDivisionError after the warmup run.
    if bundle < 1 or nruns < 1:
        raise ValueError(
            f"bundle and nruns must both be >= 1 (got bundle={bundle}, "
            f"nruns={nruns})")

    _attach_gpu_context(prog)
    params = _build_param_map(prog, seed)

    # Untimed warmup call.
    prog.run(params)
    migraphx.gpu_sync()

    samples = []
    for _ in range(nruns):
        start = time.perf_counter_ns()
        for _ in range(bundle):
            prog.run(params)
        # Sync once per bundle so the wall-clock interval covers all of the
        # bundle's GPU work.
        migraphx.gpu_sync()
        elapsed_ms = (time.perf_counter_ns() - start) / 1.0e6
        samples.append(elapsed_ms / bundle)

    return _trimmed_mean_ms(samples)


# ---------------------------------------------------------------------------
# Driver: scan a directory of .mxr files, group by (preop, problem), benchmark
# each, pick the fastest, and write a MIGRAPHX_PROBLEM_CACHE-compatible JSON.
# ---------------------------------------------------------------------------


def _problem_key(preop, problem_obj):
    """Return a hashable grouping key for a ``(preop, problem)`` pair.

    The problem is serialized with sorted keys so key order does not matter;
    values ``json`` cannot serialize are stringified via ``default=str``.
    """
    return (preop, json.dumps(problem_obj, sort_keys=True, default=str))


# Trailing `_<digits>.mxr` suffix of a dumped filename.
_FILENAME_HASH_RE = re.compile(r"_(\d+)\.mxr$")


def _problem_tag_from_filename(name):
    """Extract the trailing problem-hash suffix that ``compile_plan::save_binaries``
    appends to dumped filenames (``<...>_<hash>.mxr``).

    Used purely as a short, stable label to distinguish multiple winners with
    the same ``preop`` in the console log. Returns ``None`` when the basename
    does not end in ``_<digits>.mxr``."""
    m = _FILENAME_HASH_RE.search(os.path.basename(name))
    return m.group(1) if m else None


def _env_int(name, default):
    """Return environment variable ``name`` as an ``int``.

    ``default`` is returned when the variable is unset or empty, and also
    (after a warning on stderr) when its value is not an integer.
    """
    raw = os.environ.get(name)
    if raw is None or raw == "":
        return default
    try:
        return int(raw)
    except ValueError:
        print(
            f"warning: {name}={raw!r} is not an integer; using {default}",
            file=sys.stderr)
        return default


def _scan_mxr_files(mxr_dir, pattern):
    """Return the sorted paths of regular files in ``mxr_dir`` whose name
    matches the glob ``pattern``.

    Matching is case-sensitive and not recursive. It uses
    ``fnmatch.fnmatchcase``, so ``*``, ``?`` and ``[...]`` character classes
    are all supported (the previous hand-rolled translation only understood
    ``*`` and ``?``).

    Raises:
        FileNotFoundError: if ``mxr_dir`` is not a directory.
    """
    # Function-scope import keeps this block self-contained without touching
    # the file-level import list.
    import fnmatch

    if not os.path.isdir(mxr_dir):
        raise FileNotFoundError(f"Not a directory: {mxr_dir}")
    paths = []
    for entry in sorted(os.listdir(mxr_dir)):
        if not fnmatch.fnmatchcase(entry, pattern):
            continue
        full = os.path.join(mxr_dir, entry)
        # Skip directories (and anything else that is not a regular file).
        if os.path.isfile(full):
            paths.append(full)
    return paths


def _build_arg_parser():
    """Build the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(
        description=
        "Benchmark .mxr files dumped via MIGRAPHX_GPU_DUMP_BENCHMARK_MXR "
        "and emit a MIGRAPHX_PROBLEM_CACHE-compatible JSON file.")
    parser.add_argument("mxr_dir",
                        help="Directory containing the dumped .mxr files.")
    parser.add_argument(
        "-o",
        "--output",
        default="problem_cache.json",
        help="Path for the problem-cache JSON file (default: %(default)s).")
    parser.add_argument(
        "--bundle",
        type=int,
        default=None,
        help=
        "Bundle size: number of consecutive program.run() calls per timing "
        "sample. Defaults to MIGRAPHX_BENCHMARKING_BUNDLE if set, else 4 "
        "(the value hipBLASLt's exhaustive tune uses).")
    parser.add_argument(
        "--nruns",
        type=int,
        default=None,
        help=
        "Number of timing samples to collect per .mxr file. Defaults to "
        "MIGRAPHX_BENCHMARKING_NRUNS if set, else 40.")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help=
        "Seed used by migraphx.generate_argument when filling parameters "
        "with random data (default: %(default)s).")
    parser.add_argument(
        "--pattern",
        default="*.mxr",
        help="Filename glob to match inside mxr_dir (default: %(default)s).")
    return parser


def main(argv=None):
    """Command-line entry point.

    Benchmarks every matching ``.mxr`` file in the given directory, groups the
    results by ``(preop, problem)``, and writes the fastest solution of each
    group to the output JSON file. Files that fail to load, parse, or
    benchmark are reported on stderr and skipped.

    Args:
        argv: argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` if no file could be benchmarked. Invalid
        arguments terminate through ``parser.error`` (``SystemExit``).
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Explicit CLI values win; otherwise fall back to the env vars, then to
    # the built-in defaults.
    bundle = args.bundle if args.bundle is not None else _env_int(
        "MIGRAPHX_BENCHMARKING_BUNDLE", 4)
    nruns = args.nruns if args.nruns is not None else _env_int(
        "MIGRAPHX_BENCHMARKING_NRUNS", 40)
    if bundle < 1 or nruns < 1:
        parser.error("--bundle and --nruns must both be >= 1")

    files = _scan_mxr_files(args.mxr_dir, args.pattern)
    if not files:
        parser.error(
            f"No files matching {args.pattern!r} found in {args.mxr_dir}")

    print(
        f"Benchmarking {len(files)} file(s) from {args.mxr_dir} "
        f"(bundle={bundle}, nruns={nruns})",
        flush=True)

    # Insertion-ordered so the output follows the (sorted) scan order.
    groups = OrderedDict()
    for path in files:
        rel = os.path.relpath(path, args.mxr_dir)
        try:
            prog = migraphx.load(path)
            preop, problem_obj, solution_obj = extract_comment_metadata(prog)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole run.
            print(f" [skip] {rel}: {exc}", file=sys.stderr)
            continue

        try:
            avg_ms = benchmark_program(prog,
                                       bundle=bundle,
                                       nruns=nruns,
                                       seed=args.seed)
        except Exception as exc:
            print(f" [skip] {rel}: benchmark failed: {exc}", file=sys.stderr)
            continue

        key = _problem_key(preop, problem_obj)
        # The tag of a group comes from the first file seen for that problem.
        bucket = groups.setdefault(
            key, {
                "preop": preop,
                "problem": problem_obj,
                "problem_tag": _problem_tag_from_filename(rel),
                "candidates": []
            })
        bucket["candidates"].append({
            "file": rel,
            "solution": solution_obj,
            "ms": avg_ms,
        })
        print(f" {rel}: {preop} -> {avg_ms:.6f} ms/call", flush=True)

    if not groups:
        print("error: no benchmarks completed successfully", file=sys.stderr)
        return 1

    cache_entries = []
    for bucket in groups.values():
        candidates = bucket["candidates"]
        winner = min(candidates, key=lambda c: c["ms"])
        cache_entries.append([{
            "name": bucket["preop"],
            "problem": bucket["problem"]
        }, winner["solution"]])
        tag = bucket["problem_tag"] or "?"
        print(
            f" {bucket['preop']} (problem={tag}): best = {winner['file']} "
            f"({winner['ms']:.6f} ms/call) over {len(candidates)} candidate(s)",
            flush=True)

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(cache_entries, f, indent=4)
        f.write("\n")
    print(
        f"Wrote {len(cache_entries)} entry(ies) to {args.output}", flush=True)
    print(
        f"Use it via: MIGRAPHX_PROBLEM_CACHE={args.output}", flush=True)
    return 0


if __name__ == "__main__":
    sys.exit(main())