Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -635,8 +635,12 @@ const std::vector<std::string> kPirGpuPasses{
// Operator fusion pass
"silu_fuse_pass",
"conv2d_bn_fuse_pass",
#ifdef PADDLE_WITH_CUDA
// conv2d_add(_act)_fuse_pass lower to the fused_conv2d_add_act op, which
// only has a cuDNN GPUDNN kernel. Skip on ROCm/HIP.
"conv2d_add_act_fuse_pass",
"conv2d_add_fuse_pass",
#endif
"embedding_eltwise_layernorm_fuse_pass",
"fused_rotary_position_embedding_pass",
"fused_flash_attn_pass",
Expand Down
6 changes: 6 additions & 0 deletions paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -349,4 +349,10 @@ std::unique_ptr<Pass> CreateConv2dAddActFusePass() {

} // namespace pir

// The fused_conv2d_add_act op this pass produces only has a cuDNN
// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends.
// Skip registration on ROCm/HIP to avoid applying the rewrite and later
// failing at kernel-dispatch time.
#ifdef PADDLE_WITH_CUDA
REGISTER_IR_PASS(conv2d_add_act_fuse_pass, Conv2dAddActFusePass);
#endif
6 changes: 6 additions & 0 deletions paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,10 @@ std::unique_ptr<Pass> CreateConv2dAddFusePass() {
}
} // namespace pir

// The fused_conv2d_add_act op this pass produces only has a cuDNN
// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends.
// Skip registration on ROCm/HIP to avoid applying the rewrite and later
// failing at kernel-dispatch time.
#ifdef PADDLE_WITH_CUDA
REGISTER_IR_PASS(conv2d_add_fuse_pass, Conv2dAddFusePass);
#endif
2 changes: 2 additions & 0 deletions paddle/fluid/pir/transforms/passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ USE_PIR_PASS(matmul_add_act_fuse_pass);
USE_PIR_PASS(silu_fuse_pass);
USE_PIR_PASS(fc_elementwise_layernorm_fuse_pass);
USE_PIR_PASS(conv2d_bn_fuse_pass);
#ifdef PADDLE_WITH_CUDA
USE_PIR_PASS(conv2d_add_fuse_pass);
USE_PIR_PASS(conv2d_add_act_fuse_pass);
#endif
USE_PIR_PASS(embedding_eltwise_layernorm_fuse_pass);
USE_PIR_PASS(add_norm_fuse_pass);
USE_PIR_PASS(group_norm_silu_fuse_pass);
Expand Down
3 changes: 3 additions & 0 deletions paddle/phi/backends/gpu/rocm/miopen_desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) {
case DataType::FLOAT32:
type = miopenFloat;
break;
case DataType::BFLOAT16:
type = miopenBFloat16;
break;
default:
break;
}
Expand Down
15 changes: 10 additions & 5 deletions paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1443,34 +1443,39 @@ PD_REGISTER_KERNEL(conv2d_grad,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
phi::float16) {}
phi::float16,
phi::bfloat16) {}

PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
phi::float16) {}
phi::float16,
phi::bfloat16) {}
PD_REGISTER_KERNEL(conv2d_double_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
phi::float16) {}
phi::float16,
phi::bfloat16) {}

PD_REGISTER_KERNEL(conv3d_double_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnDoubleGradKernel,
float,
phi::float16) {}
phi::float16,
phi::bfloat16) {}

PD_REGISTER_KERNEL(depthwise_conv2d_double_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvDoubleGradGPUDNNKernel,
float,
phi::float16) {}
phi::float16,
phi::bfloat16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d_grad,
Expand Down
21 changes: 16 additions & 5 deletions paddle/phi/kernels/gpudnn/conv_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -561,18 +561,29 @@ void Conv3DCudnnKernel(const Context& dev_ctx,
} // namespace phi

#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(
conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {}
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
phi::float16,
phi::bfloat16) {}

PD_REGISTER_KERNEL(
conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
phi::float16,
phi::bfloat16) {}

PD_REGISTER_KERNEL(depthwise_conv2d,
GPUDNN,
ALL_LAYOUT,
phi::DepthwiseConvCudnnKernel,
float,
phi::float16) {}
phi::float16,
phi::bfloat16) {}

#else
#if CUDNN_VERSION_MIN(8, 1, 0)
Expand Down
13 changes: 11 additions & 2 deletions paddle/phi/kernels/gpudnn/softmax_gpudnn.h
Original file line number Diff line number Diff line change
Expand Up @@ -1325,7 +1325,7 @@ void LaunchKeMatrixSoftmaxForwardKernel(const GPUContext& dev_ctx,
<<<N, block_dim, 0, dev_ctx.stream()>>>(out, input, dim_size);
}

#if CUDNN_VERSION < 8100
#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION < 8100
template <>
inline void LaunchSoftmaxForwardCudnnKernel<phi::bfloat16>(
const GPUContext& dev_ctx,
Expand Down Expand Up @@ -2811,7 +2811,16 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
dim_log2);
}
} else {
if (dim >= MATRIX_SOFTMAX_THRESHOLD) {
bool use_matrix_kernel = dim >= MATRIX_SOFTMAX_THRESHOLD;
#ifdef PADDLE_WITH_HIP
// MIOpen (as of ROCm 7.x) returns MIOPEN_STATUS_NOT_IMPLEMENTED for
// miopenSoftmaxForward_V2 with miopenBFloat16. Route BF16 through the
// matrix softmax kernel for any dim that exceeds the warp-softmax cap.
if (std::is_same<T, phi::bfloat16>::value) {
use_matrix_kernel = true;
}
#endif
if (use_matrix_kernel) {
LaunchKeMatrixSoftmaxForwardKernel<T, IndexType, LogMode>(
dev_ctx, out_data, x.data<T>(), N, dim);
} else {
Expand Down