GraphBench/graph_bench.cpp at main · saleelk/GraphBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
// SPDX-License-Identifier: MIT
// Copyright (c) 2026 Saleel Kudchadker
//
// graph_bench.cpp
//
// Measures hipGraphLaunch latency across several graph topologies and sizes.
// All topologies use approximately N total kernel nodes so that comparisons
// are apples-to-apples.
//
// Topologies
// ----------
//   straight  1 segment of N nodes chained linearly.
//
//   paths2    Hexagon-style with 2 parallel branches:
//               lead (N/4) -> branch_0 (N/4) + branch_1 (N/4) -> tail (N/4)
//             4 segments total.
//
//   paths4    Same pattern with 4 parallel branches, each of length N/6.
//             6 segments total.
//
//   full2     2 fully independent chains of N/2 nodes. No sync point.
//
//   full4     4 fully independent chains of N/4 nodes. No sync point.
//
// Verification (--verify)
// -----------------------
// Replaces null_kernel with a reduction-based ordering check:
//
//   verify_kernel writes:
//     buf[nodeId] = sum(buf[dep_ids]) + 1
//
// For a root node (no deps) this is 1.  For a chain node it is predecessor+1.
// For the hexagon join it is the sum of all branch-tail values plus 1.
//
// Expected values are computed on the CPU at build time using the same
// recurrence.  Only the graph's exit node(s) are checked:
//
//   straight / paths*  : one exit (tail of trailing chain)
//   full2 / full4      : one exit per independent chain
//
// If any node ran before its dependencies, its buf slot holds a smaller value
// than expected (it read zeros instead of its predecessor's value), and that
// propagates to the exit — so a single wrong exit value catches the race.
//
// Example  paths2, N=12, seg=3:
//   Lead:    buf[0]=1  buf[1]=2  buf[2]=3
//   Branch0: buf[3]=4  buf[4]=5  buf[5]=6
//   Branch1: buf[6]=4  buf[7]=5  buf[8]=6
//   Join:    buf[9] = buf[5]+buf[8]+1 = 13  (wrong if either branch not done)
//   Tail:    buf[10]=14  buf[11]=15
//   Check:   buf[11] == 15
//
// Build (HIP/AMD):
//   /opt/rocm/bin/hipcc -O2 -o graph_bench graph_bench.cpp
//   cmake -B build -DCMAKE_PREFIX_PATH=/opt/rocm && cmake --build build
//
// Build (CUDA/NVIDIA):
//   nvcc -O2 -x cu -o graph_bench graph_bench.cpp
//   cmake -B build -DUSE_CUDA=ON && cmake --build build
//
// Usage:
//   ./graph_bench [--size N] [--iters N] [--no-sync] [--sync]
//                 [--sweep] [--topology <name>] [--instantiate]
//                 [--verify] [--verify-iters N] [--verify-delay-us N]
//
//   --size N            Total kernel nodes (default: 1024)
//   --graphSize N       Alias for --size
//   --iters N           Timed repetitions per measurement (default: 1000)
//   --no-sync           Submission latency only (default)
//   --sync              Submission + GPU execution latency
//   --sweep             Run across all sizes: 1, 2, 4, ..., 8192
//   --topology <name>   Benchmark only the named topology (default: all)
//   --instantiate       Measure hipGraphInstantiate time (alongside launch)
//   --verify            Run ordering correctness check instead of timing
//   --verify-iters N    Verify launches per topology (default: 50)
//   --verify-delay-us N Per-node busy-wait to widen race window (default: 1)

// ---------------------------------------------------------------------------
// HIP / CUDA portability layer
// Code is written against the HIP API; when compiled with nvcc the hip*
// symbols are remapped to their cuda* equivalents.
// ---------------------------------------------------------------------------
#if defined(__NVCC__)
#include <cuda_runtime.h>
// Types
#define hipError_t            cudaError_t
#define hipDeviceProp_t       cudaDeviceProp
#define hipStream_t           cudaStream_t
#define hipGraph_t            cudaGraph_t
#define hipGraphExec_t        cudaGraphExec_t
#define hipGraphNode_t        cudaGraphNode_t
#define hipKernelNodeParams   cudaKernelNodeParams
// Error values
#define hipSuccess            cudaSuccess
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
// Runtime
#define hipGetDevice            cudaGetDevice
#define hipGetDeviceProperties  cudaGetDeviceProperties
#define hipGetErrorString       cudaGetErrorString
#define hipStreamCreate         cudaStreamCreate
#define hipStreamDestroy        cudaStreamDestroy
#define hipStreamSynchronize    cudaStreamSynchronize
#define hipMalloc               cudaMalloc
#define hipFree                 cudaFree
#define hipMemset               cudaMemset
#define hipMemcpy               cudaMemcpy
// Graph
#define hipGraphCreate                          cudaGraphCreate
#define hipGraphDestroy                         cudaGraphDestroy
#define hipGraphAddKernelNode                   cudaGraphAddKernelNode
#define hipGraphInstantiate(e,g,_,__,f)         cudaGraphInstantiate(e,g,nullptr,nullptr,f)
#define hipGraphExecDestroy                     cudaGraphExecDestroy
#define hipGraphLaunch                          cudaGraphLaunch
// Stream capture
#define hipStreamBeginCapture         cudaStreamBeginCapture
#define hipStreamEndCapture           cudaStreamEndCapture
#define hipStreamCaptureModeGlobal    cudaStreamCaptureModeGlobal
#else
#include <hip/hip_runtime.h>
#endif

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

#define HIP_CHECK(expr)                                                        \
  do {                                                                         \
    hipError_t _e = (expr);                                                    \
    if (_e != hipSuccess) {                                                    \
      fprintf(stderr, "HIP error %d (%s) at %s:%d\n", _e,                     \
              hipGetErrorString(_e), __FILE__, __LINE__);                      \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

class Timer {
 public:
  void reserve(int n) { samples_.reserve(n); }

  void start() { t_ = std::chrono::high_resolution_clock::now(); }

  void stop() {
    samples_.push_back(std::chrono::duration<double, std::micro>(
                           std::chrono::high_resolution_clock::now() - t_)
                           .count());
  }

  double avg() const {
    return std::accumulate(samples_.begin(), samples_.end(), 0.0) /
           samples_.size();
  }

 private:
  std::chrono::high_resolution_clock::time_point t_;
  std::vector<double> samples_;
};

// ---------------------------------------------------------------------------
// Kernels
// ---------------------------------------------------------------------------

__global__ void null_kernel() {}

// Reduction-based ordering kernel.
//
// Writes buf[nodeId] = buf[d0] + buf[d1] + buf[d2] + buf[d3] + 1
// where d0..d3 are the IDs of this node's graph predecessors (unused slots
// are 0 and excluded via ndeps).
//
// If any predecessor has not yet written its slot (i.e. ran out of order),
// buf[dep] is still 0 and the computed value will be smaller than expected.
// That deficit propagates through the chain to the exit node, where a single
// comparison catches the ordering violation.
//
// Supports up to 4 predecessor IDs, which is enough for paths4 (join has 4).
//
// Race-window amplification: dependencies are read *early* (at kernel entry)
// and this node's own result is written *late* (after an optional busy-wait of
// delay_cycles).  If the runtime incorrectly schedules a successor before this
// node finishes, that successor reads our slot while it is still 0, producing a
// detectable deficit.  With near-zero-duration kernels the window is too small
// to observe such bugs; the delay makes it reliably catchable.
__global__ void verify_kernel(int* buf, int nodeId,
                               int d0, int d1, int d2, int d3, int ndeps,
                               long long delay_cycles) {
  int val = 1;
  if (ndeps > 0) val += buf[d0];
  if (ndeps > 1) val += buf[d1];
  if (ndeps > 2) val += buf[d2];
  if (ndeps > 3) val += buf[d3];

  if (delay_cycles > 0) {
    const long long start = clock64();
    long long now = start;
    while (now - start < delay_cycles) now = clock64();
  }

  buf[nodeId] = val;
}

// ---------------------------------------------------------------------------
// Verification context
// ---------------------------------------------------------------------------

// Passed to graph builders when --verify is active.
//
// add_node() auto-assigns a node ID, registers the handle->ID mapping (so
// that later nodes can resolve their predecessor IDs by handle), computes the
// expected output value using the same recurrence as verify_kernel, and wires
// up verify_kernel in the graph node.
//
// After the build, exits[] contains the exit node ID(s) and expected[] holds
// the correct value for every node.  Only the exit values are checked.
struct VerifyCtx {
  int*       dev_buf      = nullptr;  // device buffer (caller-owned)
  int        next_id      = 0;        // auto-incremented as nodes are added
  long long  delay_cycles = 0;        // per-node busy-wait (race amplifier)

  std::vector<int>                             expected;    // expected[nodeId]
  std::vector<int>                             exits;       // exit node IDs
  std::unordered_map<hipGraphNode_t, int>      node_to_id;  // handle -> ID

  // Stable heap storage for per-node kernel args.
  // hipGraphAddKernelNode may store pointers into kernelParams rather than
  // copying values immediately
  // Each entry: {nodeId, d0, d1, d2, d3, ndeps} — must not reallocate after
  // pointers are handed to hipGraphAddKernelNode, so reserve(N) before build.
  std::vector<std::array<int, 6>> node_args;
};

// ---------------------------------------------------------------------------
// Node helper
// ---------------------------------------------------------------------------

// Add one kernel node to graph g depending on deps[0..ndeps-1].
//
// Non-verify mode (ctx == nullptr): wires null_kernel, returns -1.
//
// Verify mode: assigns the next ID, resolves predecessor handles to IDs via
// ctx->node_to_id, computes the expected value, sets up verify_kernel with
// the dep IDs as arguments, registers the new handle, and returns the ID.
// The caller is responsible for adding the returned ID to ctx->exits if this
// is an exit node.
static int add_node(hipGraph_t g, hipGraphNode_t* cur,
                    const hipGraphNode_t* deps, int ndeps,
                    VerifyCtx* ctx) {
  hipKernelNodeParams p{};
  p.gridDim  = {1, 1, 1};
  p.blockDim = {1, 1, 1};

  if (!ctx) {
    p.func = reinterpret_cast<void*>(null_kernel);
    HIP_CHECK(hipGraphAddKernelNode(cur, g, deps, ndeps, &p));
    return -1;
  }

  // Resolve predecessor handles to IDs and compute expected value on CPU.
  int d[4]  = {0, 0, 0, 0};
  int exp   = 1;
  for (int i = 0; i < ndeps && i < 4; ++i) {
    d[i] = ctx->node_to_id.at(deps[i]);
    exp += ctx->expected[d[i]];
  }

  int id = ctx->next_id++;
  ctx->expected.push_back(exp);

  // Store args in stable heap memory — hipGraphAddKernelNode may retain
  // pointers until hipGraphInstantiate rather than copying values immediately.
  ctx->node_args.push_back({id, d[0], d[1], d[2], d[3], ndeps});
  auto& sa = ctx->node_args.back();
  // delay_cycles is identical for all nodes; point every node at the single
  // stable copy held in ctx (valid through instantiate).
  void* args[] = {reinterpret_cast<void*>(&ctx->dev_buf),
                  reinterpret_cast<void*>(&sa[0]),
                  reinterpret_cast<void*>(&sa[1]),
                  reinterpret_cast<void*>(&sa[2]),
                  reinterpret_cast<void*>(&sa[3]),
                  reinterpret_cast<void*>(&sa[4]),
                  reinterpret_cast<void*>(&sa[5]),
                  reinterpret_cast<void*>(&ctx->delay_cycles)};
  p.func         = reinterpret_cast<void*>(verify_kernel);
  p.kernelParams = args;
  HIP_CHECK(hipGraphAddKernelNode(cur, g, deps, ndeps, &p));

  ctx->node_to_id[*cur] = id;  // register after hipGraphAddKernelNode sets *cur
  return id;
}

// ---------------------------------------------------------------------------
// Graph creators — build the hipGraph_t (node topology) without instantiating.
// Callers instantiate separately so that instantiation can be timed.
// ---------------------------------------------------------------------------

// straight: single linear chain of N nodes.
// Exit: the last node.
static hipGraph_t create_straight(int N, VerifyCtx* ctx = nullptr) {
  hipGraph_t g;
  HIP_CHECK(hipGraphCreate(&g, 0));

  hipGraphNode_t prev{}, cur{};
  int last_id = -1;
  for (int i = 0; i < N; ++i) {
    last_id = add_node(g, &cur, i == 0 ? nullptr : &prev, i == 0 ? 0 : 1, ctx);
    prev    = cur;
  }
  if (ctx && last_id >= 0) ctx->exits.push_back(last_id);
  return g;
}

// multi-path (hexagon): lead -> P parallel branches -> tail.
//   seg = N / (P + 2)  nodes per segment
//   total segments     = P + 2
//
// Ordering guarantee encoded in expected values:
//   join's expected = sum(branch_tail_expected[0..P-1]) + 1
// If any branch tail ran after the join, its buf slot is 0 at join time,
// making the join's actual value smaller than expected. That deficit
// propagates through the tail chain to the single exit node.
static hipGraph_t create_multi_path(int N, int P, VerifyCtx* ctx = nullptr) {
  const int seg = std::max(1, N / (P + 2));

  hipGraph_t g;
  HIP_CHECK(hipGraphCreate(&g, 0));

  // Leading straight chain.
  hipGraphNode_t prev{}, cur{};
  int last_id = -1;
  for (int i = 0; i < seg; ++i) {
    last_id = add_node(g, &cur, i == 0 ? nullptr : &prev, i == 0 ? 0 : 1, ctx);
    prev    = cur;
  }
  hipGraphNode_t split_node = prev;

  // P parallel branches, each rooted at split_node.
  // Dependency wiring uses node handles, so only the tail handle is tracked.
  std::vector<hipGraphNode_t> path_ends(P);
  for (int path = 0; path < P; ++path) {
    hipGraphNode_t pprev = split_node, pcur{};
    for (int i = 0; i < seg; ++i) {
      add_node(g, &pcur, &pprev, 1, ctx);
      pprev = pcur;
    }
    path_ends[path] = pprev;
  }

  // Join node: depends on all P branch tails.
  hipGraphNode_t join{};
  last_id = add_node(g, &join, path_ends.data(), P, ctx);
  prev    = join;

  // Trailing straight chain.
  for (int i = 1; i < seg; ++i) {
    last_id = add_node(g, &cur, &prev, 1, ctx);
    prev    = cur;
  }
  if (ctx && last_id >= 0) ctx->exits.push_back(last_id);
  return g;
}

static hipGraph_t create_paths2(int N, VerifyCtx* ctx = nullptr) {
  return create_multi_path(N, 2, ctx);
}
static hipGraph_t create_paths4(int N, VerifyCtx* ctx = nullptr) {
  return create_multi_path(N, 4, ctx);
}

// fully parallel: P independent chains of N/P nodes.
// No synchronisation point — GPU can schedule all chains concurrently.
// Exit: last node of each chain (P exits total).
static hipGraph_t create_full_parallel(int N, int P,
                                       VerifyCtx* ctx = nullptr) {
  const int seg = std::max(1, N / P);

  hipGraph_t g;
  HIP_CHECK(hipGraphCreate(&g, 0));

  for (int path = 0; path < P; ++path) {
    hipGraphNode_t pprev{}, pcur{};
    int            last_id = -1;
    for (int i = 0; i < seg; ++i) {
      last_id = add_node(g, &pcur,
                         i == 0 ? nullptr : &pprev, i == 0 ? 0 : 1, ctx);
      pprev   = pcur;
    }
    if (ctx && last_id >= 0) ctx->exits.push_back(last_id);
  }
  return g;
}

static hipGraph_t create_full2(int N, VerifyCtx* ctx = nullptr) {
  return create_full_parallel(N, 2, ctx);
}
static hipGraph_t create_full4(int N, VerifyCtx* ctx = nullptr) {
  return create_full_parallel(N, 4, ctx);
}

// ---------------------------------------------------------------------------
// Benchmark runners
// ---------------------------------------------------------------------------

static double bench(hipGraphExec_t exec, int iters, bool syncInTiming) {
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  for (int i = 0; i < 10; ++i) HIP_CHECK(hipGraphLaunch(exec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  Timer t;
  t.reserve(iters);
  for (int i = 0; i < iters; ++i) {
    t.start();
    HIP_CHECK(hipGraphLaunch(exec, stream));
    if (syncInTiming) HIP_CHECK(hipStreamSynchronize(stream));
    t.stop();
    HIP_CHECK(hipStreamSynchronize(stream));
  }

  HIP_CHECK(hipStreamDestroy(stream));
  return t.avg();
}

// Measures hipGraphInstantiate latency for a single cold instantiation.
// Returns the elapsed time in microseconds and the resulting hipGraphExec_t
// (caller owns it).
static double bench_instantiate(hipGraph_t graph, hipGraphExec_t* out) {
  Timer t;
  t.reserve(1);
  hipGraphExec_t e;
  t.start();
  HIP_CHECK(hipGraphInstantiate(&e, graph, nullptr, nullptr, 0));
  t.stop();
  *out = e;
  return t.avg();
}

// ---------------------------------------------------------------------------
// Verification runner
// ---------------------------------------------------------------------------

// Builds the graph with verify_kernel nodes and repeatedly launches it,
// resetting the device buffer before each launch.  After every launch the
// full buffer is copied back and *all* node values are compared against the
// CPU-computed expected values (not just the exits), which both catches
// ordering violations that do not propagate to an exit and pinpoints where
// the violation occurred.
//
// Running multiple iterations is important because ordering bugs are
// nondeterministic; a single launch may pass by luck.  Each verify_kernel
// also busy-waits delay_cycles before writing its result, widening the window
// in which an out-of-order successor would observe a stale 0.
//
// *out_nexits is set to the number of exit nodes found.
static bool verify(hipGraph_t (*create)(int, VerifyCtx*), int N,
                   const char* name, long long delay_cycles, int iters,
                   int* out_nexits) {
  VerifyCtx ctx;
  ctx.delay_cycles = delay_cycles;
  // Reserve N slots upfront so node_args never reallocates while
  // hipGraphAddKernelNode holds pointers into it.
  ctx.node_args.reserve(N);
  ctx.expected.reserve(N);
  // Over-allocate: actual node count (after integer-division seg rounding)
  // may be slightly less than N, but never more.
  HIP_CHECK(hipMalloc(&ctx.dev_buf, N * sizeof(int)));

  hipGraph_t g = create(N, &ctx);
  hipGraphExec_t exec;
  HIP_CHECK(hipGraphInstantiate(&exec, g, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphDestroy(g));

  *out_nexits = static_cast<int>(ctx.exits.size());
  const int total = ctx.next_id;

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  std::vector<int> host(total);
  bool pass     = true;
  int  reported = 0;
  for (int it = 0; it < iters && pass; ++it) {
    HIP_CHECK(hipMemset(ctx.dev_buf, 0, total * sizeof(int)));
    HIP_CHECK(hipGraphLaunch(exec, stream));
    HIP_CHECK(hipStreamSynchronize(stream));
    HIP_CHECK(hipMemcpy(host.data(), ctx.dev_buf, total * sizeof(int),
                        hipMemcpyDeviceToHost));

    for (int id = 0; id < total; ++id) {
      if (host[id] != ctx.expected[id]) {
        if (reported < 8) {
          fprintf(stderr,
                  "  [%s] FAIL iter %d node %d: got %d, expected %d "
                  "(ordering violation)\n",
                  name, it, id, host[id], ctx.expected[id]);
          ++reported;
        }
        pass = false;
      }
    }
  }

  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipGraphExecDestroy(exec));
  HIP_CHECK(hipFree(ctx.dev_buf));
  return pass;
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

int main(int argc, char* argv[]) {
  int         size         = 1024;
  int         iters        = 1000;
  bool        syncInTiming = false;
  bool        sweep        = false;
  bool        do_verify    = false;
  bool        measure_inst = false;
  int         verify_iters = 50;
  int         verify_delay_us = 1;
  std::string topo         = "all";

  for (int i = 1; i < argc; ++i) {
    if ((!strcmp(argv[i], "--size") || !strcmp(argv[i], "--graphSize")) &&
        i + 1 < argc)
      size = atoi(argv[++i]);
    else if (!strcmp(argv[i], "--iters") && i + 1 < argc)
      iters = atoi(argv[++i]);
    else if (!strcmp(argv[i], "--no-sync"))
      syncInTiming = false;
    else if (!strcmp(argv[i], "--sync"))
      syncInTiming = true;
    else if (!strcmp(argv[i], "--sweep"))
      sweep = true;
    else if (!strcmp(argv[i], "--topology") && i + 1 < argc)
      topo = argv[++i];
    else if (!strcmp(argv[i], "--instantiate"))
      measure_inst = true;
    else if (!strcmp(argv[i], "--verify"))
      do_verify = true;
    else if (!strcmp(argv[i], "--verify-iters") && i + 1 < argc)
      verify_iters = atoi(argv[++i]);
    else if (!strcmp(argv[i], "--verify-delay-us") && i + 1 < argc)
      verify_delay_us = atoi(argv[++i]);
  }

  int deviceId;
  HIP_CHECK(hipGetDevice(&deviceId));
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
  printf("Device : %s\n", props.name);

  struct Topo {
    const char*     name;
    hipGraph_t    (*create)(int, VerifyCtx*);
  };

  const Topo topos[] = {
      {"straight", create_straight},
      {"paths2",   create_paths2},
      {"paths4",   create_paths4},
      {"full2",    create_full2},
      {"full4",    create_full4},
  };
  const int ntopos = static_cast<int>(sizeof(topos) / sizeof(topos[0]));

  // Build the list of selected topology indices based on --topology.
  std::vector<int> sel;
  for (int t = 0; t < ntopos; ++t) {
    if (topo == "all" || topo == topos[t].name) sel.push_back(t);
  }
  const int nsel = static_cast<int>(sel.size());

  if (nsel == 0) {
    fprintf(stderr, "Unknown topology '%s'.  Available:", topo.c_str());
    for (int t = 0; t < ntopos; ++t) fprintf(stderr, " %s", topos[t].name);
    fprintf(stderr, "\n");
    return 1;
  }

  // -------------------------------------------------------------------------
  // Verification mode
  // -------------------------------------------------------------------------
  if (do_verify) {
    // Convert the requested per-node delay (microseconds) to device clock
    // cycles for clock64().  props.clockRate is in kHz, so cycles-per-us is
    // clockRate/1000.  Fall back to ~1.7 GHz if the runtime reports 0.
    const long long cyc_per_us =
        props.clockRate > 0 ? props.clockRate / 1000 : 1700;
    const long long delay_cycles =
        static_cast<long long>(verify_delay_us) * cyc_per_us;

    printf("Mode   : verify (ordering check, size=%d, iters=%d, "
           "delay=%dus/node)\n\n",
           size, verify_iters, verify_delay_us);
    printf("%-10s  %-6s  %s\n", "topology", "exits", "result");
    printf("%s\n", std::string(32, '-').c_str());

    bool all_pass = true;
    for (int si = 0; si < nsel; ++si) {
      const auto& tp = topos[sel[si]];
      int  nexits = 0;
      const bool pass =
          verify(tp.create, size, tp.name, delay_cycles, verify_iters,
                 &nexits);
      printf("%-10s  %-6d  %s\n", tp.name, nexits,
             pass ? "PASS" : "FAIL");
      all_pass &= pass;
    }
    return all_pass ? 0 : 1;
  }

  // -------------------------------------------------------------------------
  // Benchmark mode
  // -------------------------------------------------------------------------
  printf("Mode   : %s\n",
         syncInTiming ? "sync (submission+GPU)" : "no-sync (submission only)");
  printf("Iters  : %d per measurement\n", iters);
  if (measure_inst) printf("Metrics: instantiate + launch\n");
  printf("\n");

  if (sweep) {
    const int sweep_sizes[] = {1,   2,   4,    8,    16,   32,
                                64,  128, 256,  512,  1024, 2048,
                                4096, 8192};
    const int nsizes =
        static_cast<int>(sizeof(sweep_sizes) / sizeof(sweep_sizes[0]));

    // ----- Instantiation sweep table -----
    if (measure_inst) {
      printf("--- instantiate (us) ---\n");
      printf("%-7s", "size");
      for (int si = 0; si < nsel; ++si)
        printf("  %10s", topos[sel[si]].name);
      printf("\n%s\n", std::string(7 + nsel * 12, '-').c_str());

      for (int s = 0; s < nsizes; ++s) {
        const int N = sweep_sizes[s];
        printf("%-7d", N);
        for (int si = 0; si < nsel; ++si) {
          hipGraph_t g = topos[sel[si]].create(N, nullptr);
          hipGraphExec_t e;
          const double avg = bench_instantiate(g, &e);
          HIP_CHECK(hipGraphExecDestroy(e));
          HIP_CHECK(hipGraphDestroy(g));
          printf("  %9.3f us", avg);
        }
        printf("\n");
        fflush(stdout);
      }
      printf("\n");
    }

    // ----- Launch sweep table -----
    printf("--- launch (us) ---\n");
    printf("%-7s", "size");
    for (int si = 0; si < nsel; ++si)
      printf("  %10s", topos[sel[si]].name);
    printf("\n%s\n", std::string(7 + nsel * 12, '-').c_str());

    for (int s = 0; s < nsizes; ++s) {
      const int N = sweep_sizes[s];
      printf("%-7d", N);
      for (int si = 0; si < nsel; ++si) {
        hipGraph_t g = topos[sel[si]].create(N, nullptr);
        hipGraphExec_t e;
        HIP_CHECK(hipGraphInstantiate(&e, g, nullptr, nullptr, 0));
        HIP_CHECK(hipGraphDestroy(g));
        const double avg = bench(e, iters, syncInTiming);
        HIP_CHECK(hipGraphExecDestroy(e));
        printf("  %9.3f us", avg);
      }
      printf("\n");
      fflush(stdout);
    }

  } else {
    // ----- Single-size mode -----
    if (measure_inst) {
      printf("%-10s  %12s  %12s\n", "topology", "inst (us)", "launch (us)");
      printf("%s\n", std::string(38, '-').c_str());
      for (int si = 0; si < nsel; ++si) {
        const auto& tp = topos[sel[si]];
        hipGraph_t g = tp.create(size, nullptr);
        hipGraphExec_t e;
        const double inst_avg = bench_instantiate(g, &e);
        HIP_CHECK(hipGraphDestroy(g));
        const double launch_avg = bench(e, iters, syncInTiming);
        HIP_CHECK(hipGraphExecDestroy(e));
        printf("%-10s  %9.3f     %9.3f\n", tp.name, inst_avg, launch_avg);
      }
    } else {
      printf("%-10s  %s\n", "topology", "avg (us)");
      printf("%s\n", std::string(30, '-').c_str());
      for (int si = 0; si < nsel; ++si) {
        const auto& tp = topos[sel[si]];
        hipGraph_t g = tp.create(size, nullptr);
        hipGraphExec_t e;
        HIP_CHECK(hipGraphInstantiate(&e, g, nullptr, nullptr, 0));
        HIP_CHECK(hipGraphDestroy(g));
        const double avg = bench(e, iters, syncInTiming);
        HIP_CHECK(hipGraphExecDestroy(e));
        printf("%-10s  %.3f us\n", tp.name, avg);
      }
    }
  }

  return 0;
}