ROCm · CharlieL7 · May 8, 2026 · May 11, 2026 · May 12, 2026 · May 13, 2026
@@ -26,6 +26,8 @@ Full documentation for MIGraphX is available at
 * Added N-D scale and zero-point support for `QLinearMatMul` operator.
 * Added test cases for `QLinearConv` per-channel scale and `QLinearMatMul` N-D per-channel quantization.
 * Added find_concat_same_input matcher to convert concat(N*x) into multibroadcast(x) to reduce hipCopy() (#4981)
+* Added GPU kernel for ONNX `NonMaxSuppression` operation and redesigned the `nonmaxsuppression` operation to better represent the data-dependent output shape in the MIGraphX IR (#4893).
+
 ### Changed
 
 * Converted `nonzero` operator from device implementation to JIT compilation (#4720).
@@ -68,6 +70,7 @@ Full documentation for MIGraphX is available at
 
 ### Removed
 * Removed legacy device implementations for `argmin` and `argmax` in favor of the JIT implementations recently added (#4658).
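+* Removed `onnx_options::use_dyn_output` following the redesign of the `NonMaxSuppression` operator; set the `MIGRAPHX_USE_DYNAMIC_NMS` environment variable to get a dynamic-shaped `NonMaxSuppression` output (#4893).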
 
 ## MIGraphX 2.15 for ROCm 7.2.0
 

@@ -511,15 +511,15 @@ Operator Support Matrix
 +--------------------------+-----------+-----------------+------------------------------+
 | NegativeLogLikelihoodLoss| ❌        |                 |                              |
 +--------------------------+-----------+-----------------+------------------------------+
-| NonMaxSuppression        | ✅        | FP8, FP16,      | fixed output                 |
-|                          |           | FP32, FP64      | size unless                  |
-|                          |           |                 | ``use_dyn_output``           |
-|                          |           |                 | set                          |
-+--------------------------+-----------+-----------------+------------------------------+
-| NonZero                  | ✅        | FP8, FP16,      | fixed output                 |
-|                          |           | FP32, FP64      | size unless                  |
-|                          |           |                 | ``use_dyn_output``           |
-|                          |           |                 | set                          |
+| NonMaxSuppression        | ✅        | FP8, FP16,      |                              |
+|                          |           | FP32, FP64      |                              |
+|                          |           |                 |                              |
+|                          |           |                 |                              |
++--------------------------+-----------+-----------------+------------------------------+
+| NonZero                  | ✅        | FP8, FP16,      | fixed output size            |
+|                          |           | FP32, FP64      |                              |
+|                          |           |                 |                              |
+|                          |           |                 |                              |
 +--------------------------+-----------+-----------------+------------------------------+
 | Not                      | ✅        | BOOL            |                              |
 +--------------------------+-----------+-----------------+------------------------------+

@@ -298,6 +298,14 @@ Model performance tunable variables change the compilation behavior of a model.
 
       | Default: Full dynamic shape support is disabled.
 
+  * - | ``MIGRAPHX_USE_DYNAMIC_NMS``
+      | When set, the ``NonMaxSuppression`` ONNX parser performs a dynamic slice on the raw indices tensor to trim it to the number of selected boxes, producing an output with a dynamic shape.
+
+    - | ``1``: A dynamic slice is applied to the raw indices tensor, producing a dynamic-shaped output.
+      | ``0``: Returns to default behavior.
+
+      | Default: The whole raw indices tensor, sized for the maximum possible number of selected boxes, is returned without slicing.
+
 Matching
 **********
 

@@ -56,8 +56,6 @@ struct onnx_options
     /// Since loop will become a tensor of max iter size a huge number can cause overflow during
     /// shape computations.
     int64_t limit_max_iterations = std::numeric_limits<uint16_t>::max();
-    /// Use dynamic output for operators when available
-    bool use_dyn_output = false;
     /// Parse in ONNX node names as debug symbols
     bool use_debug_symbols = false;
     /// Path to use for the external data if it is stored at different location compared to onnx

@@ -36,27 +36,32 @@
 #include <migraphx/tensor_view.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/shape.hpp>
 #include <migraphx/output_iterator.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/par.hpp>
 
-/*
-https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression
-*/
+/**
+ *  nonmaxsuppression(boxes,
+ *                    scores,
+ *                    optional(max_output_boxes_per_class),
+ *                    optional(iou_threshold),
+ *                    optional(score_threshold));
+ *  Outputs tuple of {int64 tensor with dims [max_num_boxes, 3]: selected_box_indices,
+ *                    int64 tensor with dims [1]: num_selected_indices}
+ */
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
 
 struct nonmaxsuppression
 {
     bool center_point_box = false;
-    bool use_dyn_output   = false;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
-        return pack(f(self.center_point_box, "center_point_box"),
-                    f(self.use_dyn_output, "use_dyn_output"));
+        return pack(f(self.center_point_box, "center_point_box"));
     }
 
     std::string name() const { return "nonmaxsuppression"; }
@@ -69,8 +74,9 @@ struct nonmaxsuppression
         auto max_classes           = inputs.at(1).max_lens().at(1);
         auto max_spatial_dimension = inputs.at(0).max_lens().at(1);
         // Per ONNX spec, output is [num_selected_indices, 3] where each row is
-        // [batch_index, class_index, box_index].  The maximum possible
+        // [batch_index, class_index, box_index]. The maximum possible
         // num_selected_indices = num_batches * num_classes * spatial_dimension.
+        // TODO: can also be limited by max_output_boxes_per_class
         const auto max_num_boxes = max_batches * max_classes * max_spatial_dimension;
 
         auto fixed_shape_error_check = [&]() {
@@ -87,21 +93,14 @@ struct nonmaxsuppression
             }
         };
 
-        bool needs_dyn_output = use_dyn_output or inputs.at(0).dynamic() or inputs.at(1).dynamic();
-
-        if(needs_dyn_output)
-        {
-            std::vector<shape::dynamic_dimension> out_lens = {};
-            out_lens.push_back({0, max_num_boxes});
-            out_lens.push_back({3, 3});
-            return {shape::int64_type, out_lens};
-        }
-        else
+        if(not(inputs.at(0).dynamic() or inputs.at(1).dynamic()))
         {
             fixed_shape_error_check();
-            std::vector<std::size_t> out_lens = {max_num_boxes, 3};
-            return {shape::int64_type, out_lens};
         }
+        std::vector<std::size_t> out_lens = {max_num_boxes, 3};
+        shape s_ind{shape::int64_type, out_lens};
+        shape s_num_selected{shape::int64_type, {1}};
+        return shape({s_ind, s_num_selected});
     }
 
     struct box
@@ -190,7 +189,8 @@ struct nonmaxsuppression
         return intersection_over_union > iou_threshold;
     }
 
-    // filter boxes below score_threshold
+    // Filter boxes below score_threshold.
+    // Don't filter for score if score_threshold == 0.f
     template <class T>
     std::vector<std::pair<double, int64_t>>
     filter_boxes_by_score(T scores_start, std::size_t num_boxes, double score_threshold) const
@@ -232,10 +232,11 @@ struct nonmaxsuppression
     std::size_t compute_nms(Output output,
                             const Boxes& boxes,
                             const Scores& scores,
-                            std::size_t max_output_boxes_per_class,
+                            int64_t max_output_boxes_per_class,
                             double iou_threshold,
                             double score_threshold) const
     {
+        // NOTE: should not need to fill with 0
         std::fill(output.begin(), output.end(), 0);
         const auto& lens       = scores.get_shape().lens();
         const auto num_batches = lens[0];
@@ -302,14 +303,16 @@ struct nonmaxsuppression
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         // make buffer of maximum size
-        shape max_output_shape = {output_shape.type(), output_shape.max_lens()};
+        auto output_shapes     = flatten_shapes({output_shape});
+        shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()};
         argument result{max_output_shape};
+        argument num_selected_result{output_shapes.at(1)};
 
-        std::size_t max_output_boxes_per_class =
-            (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
+        int64_t max_output_boxes_per_class = (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
         if(max_output_boxes_per_class == 0)
         {
-            return result;
+            num_selected_result.visit([&](auto output) { output[0] = 0; });
+            return {{result, num_selected_result}};
         }
         double iou_threshold     = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
         double score_threshold   = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f;
@@ -325,14 +328,8 @@ struct nonmaxsuppression
                                            score_threshold);
             });
         });
-        if(output_shape.dynamic())
-        {
-            return result.reshape({output_shape.type(), {num_selected, 3}});
-        }
-        else
-        {
-            return result;
-        }
+        num_selected_result.visit([&](auto output) { output[0] = num_selected; });
+        return {{result, num_selected_result}};
     }
 };
 

@@ -621,7 +621,7 @@ struct MIGRAPHX_EXPORT shape
 };
 
 /// Flatten subshapes to a single vector of non-tuple type of shapes
-MIGRAPHX_EXPORT std::vector<shape> flatten(const std::vector<shape>& shapes);
+MIGRAPHX_EXPORT std::vector<shape> flatten_shapes(const std::vector<shape>& shapes);
 
 MIGRAPHX_EXPORT void migraphx_to_value(value& v, const shape& s);
 MIGRAPHX_EXPORT void migraphx_from_value(const value& v, shape& s);

@@ -102,7 +102,6 @@ struct onnx_parser
     std::unordered_map<std::string, std::vector<std::size_t>> map_input_dims;
     std::unordered_map<std::string, shape::dynamic_dimension> dim_params;
     std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims;
-    bool use_dyn_output          = false;
     bool skip_unknown_operators  = false;
     bool use_debug_symbols       = false;
     int64_t max_loop_iterations  = 10;

@@ -72,7 +72,6 @@ static program parse_onnx_from(const onnx_options& options, Ts&&... xs)
     parser.skip_unknown_operators = options.skip_unknown_operators;
     parser.max_loop_iterations    = options.max_loop_iterations;
     parser.limit_max_iterations   = options.limit_max_iterations;
-    parser.use_dyn_output         = options.use_dyn_output;
 
     if(options.print_program_on_error)
     {

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -24,6 +24,9 @@
 #include <migraphx/onnx/op_parser.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/env.hpp>
+
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS)
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -38,9 +41,23 @@
                           const onnx_parser::node_info& info,
                           const std::vector<instruction_ref>& args) const
     {
-        auto op = parser.load(opd.op_name, info);
-        op.from_value({{"use_dyn_output", parser.use_dyn_output}});
-        return info.add_instruction(op, args);
+        auto op      = parser.load(opd.op_name, info);
+        auto nms_ins = info.add_instruction(op, args);
+        // Tuple element 0 is the indices buffer; sliced below when dynamic output is enabled.
+        auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
+        if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{}))
+        {
+            // TODO: make this the default behavior and remove the env var.
+            auto num_selected =
+                info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
+            auto slice_ins = info.add_instruction(
+                make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
+            return slice_ins;
+        }
+        else
+        {
+            return indices;
+        }
     }
 };
 

@@ -1380,14 +1380,14 @@ const std::vector<shape>& shape::sub_shapes() const { return impl->m_shapes; }
 
 void shape::debug_print() const { std::cout << *this << std::endl; }
 
-std::vector<shape> flatten(const std::vector<shape>& shapes)
+std::vector<shape> flatten_shapes(const std::vector<shape>& shapes)
 {
     std::vector<shape> result;
     for(const auto& s : shapes)
     {
         if(s.type() == shape::tuple_type)
         {
-            auto subs = flatten(s.sub_shapes());
+            auto subs = flatten_shapes(s.sub_shapes());
             result.insert(result.end(), subs.begin(), subs.end());
         }
         else

@@ -179,6 +179,7 @@ add_library(migraphx_gpu
     loop.cpp
     lrn.cpp
     mlir.cpp
+    nms_ops.cpp
     no_device.cpp
     pack_args.cpp
     prefuse_ops.cpp

@@ -41,7 +41,7 @@ shape code_object_op::compute_shape(std::vector<shape> inputs) const
     std::transform(einputs.begin(), einputs.end(), einputs.begin(), [](const shape& s) {
         return s.normalize_standard();
     });
-    if(not migraphx::equal(flatten(einputs), flatten(inputs), &shape::is_compatible))
+    if(not migraphx::equal(flatten_shapes(einputs), flatten_shapes(inputs), &shape::is_compatible))
         MIGRAPHX_THROW("Input shapes have changed: [" + to_string_range(einputs) + "] -> [" +
                        to_string_range(inputs) + "]");
     auto output_buffer_shape = inputs.at(get_output_arg(inputs.size()));

@@ -192,6 +192,8 @@ compute_global_for(const context& ctx, std::size_t n, std::size_t over)
     };
 }
 
+// `n`: The amount of parallel work within a block.
+// `max_block_size`: Upper limit on block size.
 std::size_t compute_block_size(const context& ctx, std::size_t n, std::size_t max_block_size)
 {
     const std::size_t min_block_size = ctx.get_current_device().get_wavefront_size();

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +33,14 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
+// Inclusive prefix sum within a kernel block.
+// Hillis-Steele scan with double-buffered (ping-pong) shared array.
+// `N`: upper bound on blockDim.x, sizes the shared buffer.
+// `op`: associative binary reduce function, e.g. sum or max.
+// `init`: initializer
+// `fs`: striding function for thread work distribution.
+// `input`: input with input(index_int).
+// `output`: output with output(index_int, inclusive_scan_value_at_index_int).
 template <index_int N,
           class Op,
           class T,
@@ -72,6 +80,7 @@ __device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input,
     });
 }
 
+// Overload of block_scan with default local_stride up to `n`.
 template <index_int N, class Op, class T, class Input, class Output>
 __device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output)
 {

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -37,7 +37,9 @@ struct context;
 
 struct hip_compile_options
 {
+    // Total number of threads
     std::size_t global;
+    // Threads per block
     std::size_t local;
     std::vector<shape> inputs;
     shape output;