diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 2cfab1eaf1cf36..ad7f59b11f98ca 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -635,8 +635,12 @@ const std::vector kPirGpuPasses{ // Operator fusion pass "silu_fuse_pass", "conv2d_bn_fuse_pass", +#ifdef PADDLE_WITH_CUDA + // conv2d_add(_act)_fuse_pass lower to the fused_conv2d_add_act op, which + // only has a cuDNN GPUDNN kernel. Skip on ROCm/HIP. "conv2d_add_act_fuse_pass", "conv2d_add_fuse_pass", +#endif "embedding_eltwise_layernorm_fuse_pass", "fused_rotary_position_embedding_pass", "fused_flash_attn_pass", diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc index d81ef58c2eecd0..9adfd1282ea6e4 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc @@ -349,4 +349,10 @@ std::unique_ptr CreateConv2dAddActFusePass() { } // namespace pir +// The fused_conv2d_add_act op this pass produces only has a cuDNN +// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends. +// Skip registration on ROCm/HIP to avoid applying the rewrite and later +// failing at kernel-dispatch time. +#ifdef PADDLE_WITH_CUDA REGISTER_IR_PASS(conv2d_add_act_fuse_pass, Conv2dAddActFusePass); +#endif diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc index 475eb426e1de93..1c5ff1e52bfd9f 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc @@ -221,4 +221,10 @@ std::unique_ptr CreateConv2dAddFusePass() { } } // namespace pir +// The fused_conv2d_add_act op this pass produces only has a cuDNN +// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends. +// Skip registration on ROCm/HIP to avoid applying the rewrite and later +// failing at kernel-dispatch time. +#ifdef PADDLE_WITH_CUDA REGISTER_IR_PASS(conv2d_add_fuse_pass, Conv2dAddFusePass); +#endif diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h index 5b34175ac4f540..31f0e514cfa54f 100644 --- a/paddle/fluid/pir/transforms/passes.h +++ b/paddle/fluid/pir/transforms/passes.h @@ -34,8 +34,10 @@ USE_PIR_PASS(matmul_add_act_fuse_pass); USE_PIR_PASS(silu_fuse_pass); USE_PIR_PASS(fc_elementwise_layernorm_fuse_pass); USE_PIR_PASS(conv2d_bn_fuse_pass); +#ifdef PADDLE_WITH_CUDA USE_PIR_PASS(conv2d_add_fuse_pass); USE_PIR_PASS(conv2d_add_act_fuse_pass); +#endif USE_PIR_PASS(embedding_eltwise_layernorm_fuse_pass); USE_PIR_PASS(add_norm_fuse_pass); USE_PIR_PASS(group_norm_silu_fuse_pass); diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h index 15276c61ef8ddb..b2119d16c4a88f 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_desc.h +++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h @@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) { case DataType::FLOAT32: type = miopenFloat; break; + case DataType::BFLOAT16: + type = miopenBFloat16; + break; default: break; } diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index ac390524b19a45..fd0c69da3b1be6 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -1443,34 +1443,39 @@ PD_REGISTER_KERNEL(conv2d_grad, ALL_LAYOUT, phi::ConvCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_grad, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv2d_double_grad, GPUDNN, ALL_LAYOUT, phi::ConvCudnnGradGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnDoubleGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, GPU, ALL_LAYOUT, phi::DepthwiseConvDoubleGradGPUDNNKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index fcc1a2fff7029a..a6fc23d41e7f45 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -561,18 +561,29 @@ void Conv3DCudnnKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL( - conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} -PD_REGISTER_KERNEL( - conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(depthwise_conv2d, GPUDNN, ALL_LAYOUT, phi::DepthwiseConvCudnnKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 4752513c483f56..c0c26949d47646 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -1325,7 +1325,7 @@ void LaunchKeMatrixSoftmaxForwardKernel(const GPUContext& dev_ctx, <<>>(out, input, dim_size); } -#if CUDNN_VERSION < 8100 +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION < 8100 template <> inline void LaunchSoftmaxForwardCudnnKernel( const GPUContext& dev_ctx, @@ -2811,7 +2811,16 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx, dim_log2); } } else { - if (dim >= MATRIX_SOFTMAX_THRESHOLD) { + bool use_matrix_kernel = dim >= MATRIX_SOFTMAX_THRESHOLD; +#ifdef PADDLE_WITH_HIP + // MIOpen (as of ROCm 7.x) returns MIOPEN_STATUS_NOT_IMPLEMENTED for + // miopenSoftmaxForward_V2 with miopenBFloat16. Route BF16 through the + // matrix softmax kernel for any dim that exceeds the warp-softmax cap. + if (std::is_same::value) { + use_matrix_kernel = true; + } +#endif + if (use_matrix_kernel) { LaunchKeMatrixSoftmaxForwardKernel( dev_ctx, out_data, x.data(), N, dim); } else {