ROCm · paperybox · Apr 13, 2026
diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h
@@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) {
     case DataType::FLOAT32:
       type = miopenFloat;
       break;
+    case DataType::BFLOAT16:
+      type = miopenBFloat16;
+      break;
     default:
       break;
   }

diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -786,8 +786,13 @@ template PADDLE_API void LayerNormKernel<double, GPUContext>(
 
 #ifdef PADDLE_WITH_HIP
 // MIOPEN do not support double
-PD_REGISTER_KERNEL(
-    layer_norm, GPU, ALL_LAYOUT, phi::LayerNormKernel, float, phi::float16) {
+PD_REGISTER_KERNEL(layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {
   kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
 }

diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -1443,7 +1443,8 @@ PD_REGISTER_KERNEL(conv2d_grad,
                    ALL_LAYOUT,
                    phi::ConvCudnnGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(conv3d_grad,
                    GPUDNN,
@@ -1456,7 +1457,8 @@ PD_REGISTER_KERNEL(conv2d_double_grad,
                    ALL_LAYOUT,
                    phi::ConvCudnnGradGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(conv3d_double_grad,
                    GPUDNN,

diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -561,8 +561,13 @@ void Conv3DCudnnKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_HIP
-PD_REGISTER_KERNEL(
-    conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {}
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(
     conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {}

diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py
@@ -152,7 +152,11 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 def create_test_cudnn_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCUDNNCase(parent):
@@ -171,7 +175,11 @@ def init_kernel_type(self):
 
 def create_test_cudnn_fp16_class(parent, grad_check=True):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestConv2DCUDNNFp16(parent):
@@ -206,7 +214,11 @@ def test_check_grad_no_input(self):
 
 def create_test_cudnn_bf16_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device())
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        )
         or not core.is_bfloat16_supported(get_device_place()),
         "core is not compiled with CUDA and do not support bfloat16",
     )
@@ -273,15 +285,23 @@ def init_test_case_2(self):
 
 def create_test_cudnn_channel_last_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = (
                 np.float32
-                if (core.is_compiled_with_rocm() or is_custom_device())
+                if (
+                    core.is_compiled_with_cuda()
+                    or core.is_compiled_with_rocm()
+                    or is_custom_device()
+                )
                 else np.float64
             )
 
@@ -299,7 +319,11 @@ def init_test_case_2(self):
 
 def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCudnnChannelLastFp16(parent):
@@ -308,7 +332,11 @@ def init_kernel_type(self):
             self.dtype = np.float16
 
         def test_check_output(self):
-            if core.is_compiled_with_cuda() or is_custom_device():
+            if (
+                core.is_compiled_with_cuda()
+                or core.is_compiled_with_rocm()
+                or is_custom_device()
+            ):
                 place = get_device_place()
                 if core.is_float16_supported(place):
                     self.check_output_with_place(place, atol=2e-2)
@@ -363,15 +391,23 @@ def init_paddings(self):
 
 def create_test_cudnn_padding_SAME_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCUDNNPaddingSAMECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = (
                 np.float32
-                if (core.is_compiled_with_rocm() or is_custom_device())
+                if (
+                    core.is_compiled_with_cuda()
+                    or core.is_compiled_with_rocm()
+                    or is_custom_device()
+                )
                 else np.float64
             )
 
@@ -386,15 +422,23 @@ def init_paddings(self):
 
 def create_test_cudnn_padding_VALID_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = (
                 np.float32
-                if (core.is_compiled_with_rocm() or is_custom_device())
+                if (
+                    core.is_compiled_with_cuda()
+                    or core.is_compiled_with_rocm()
+                    or is_custom_device()
+                )
                 else np.float64
             )