From 2e3874844445bd45abc3a663a26e54aae420dd30 Mon Sep 17 00:00:00 2001
From: "Heyuan.Hu-M4" <heyuan.hu0623@gmail.com>
Date: Mon, 13 Apr 2026 16:34:27 +0800
Subject: [PATCH] add bf16 support for paddleocr-vl

---
 paddle/phi/backends/gpu/rocm/miopen_desc.h    |  3 +
 paddle/phi/kernels/gpu/layer_norm_kernel.cu   |  9 ++-
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu |  6 +-
 paddle/phi/kernels/gpudnn/conv_kernel.cu      |  9 ++-
 test/legacy_test/test_conv2d_op.py            | 66 +++++++++++++++----
 5 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h
index 15276c61ef8ddb..b2119d16c4a88f 100644
--- a/paddle/phi/backends/gpu/rocm/miopen_desc.h
+++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h
@@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) {
     case DataType::FLOAT32:
       type = miopenFloat;
       break;
+    case DataType::BFLOAT16:
+      type = miopenBFloat16;
+      break;
     default:
       break;
   }
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index 1e0c2c92a3766c..447362c6b6f95a 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -786,8 +786,13 @@ template PADDLE_API void LayerNormKernel<double, GPUContext>(
 
 #ifdef PADDLE_WITH_HIP
 // MIOPEN do not support double
-PD_REGISTER_KERNEL(
-    layer_norm, GPU, ALL_LAYOUT, phi::LayerNormKernel, float, phi::float16) {
+PD_REGISTER_KERNEL(layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {
   kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
 }
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index ac390524b19a45..385726e413d34b 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -1443,7 +1443,8 @@ PD_REGISTER_KERNEL(conv2d_grad,
                    ALL_LAYOUT,
                    phi::ConvCudnnGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(conv3d_grad,
                    GPUDNN,
@@ -1456,7 +1457,8 @@ PD_REGISTER_KERNEL(conv2d_double_grad,
                    ALL_LAYOUT,
                    phi::ConvCudnnGradGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(conv3d_double_grad,
                    GPUDNN,
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index fcc1a2fff7029a..1c05fc884a17a1 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -561,8 +561,13 @@ void Conv3DCudnnKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_HIP
-PD_REGISTER_KERNEL(
-    conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {}
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(
     conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {}
diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py
index 170757dd57b5fe..320c7a9f968123 100644
--- a/test/legacy_test/test_conv2d_op.py
+++ b/test/legacy_test/test_conv2d_op.py
@@ -152,7 +152,11 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 def create_test_cudnn_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCUDNNCase(parent):
@@ -171,7 +175,11 @@ def init_kernel_type(self):
 
 def create_test_cudnn_fp16_class(parent, grad_check=True):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestConv2DCUDNNFp16(parent):
@@ -206,7 +214,11 @@ def test_check_grad_no_input(self):
 
 def create_test_cudnn_bf16_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device())
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        )
         or not core.is_bfloat16_supported(get_device_place()),
         "core is not compiled with CUDA and do not support bfloat16",
     )
@@ -273,7 +285,11 @@ def init_test_case_2(self):
 
 def create_test_cudnn_channel_last_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCudnnChannelLastCase(parent):
@@ -281,7 +297,11 @@ def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = (
                 np.float32
-                if (core.is_compiled_with_rocm() or is_custom_device())
+                if (
+                    core.is_compiled_with_cuda()
+                    or core.is_compiled_with_rocm()
+                    or is_custom_device()
+                )
                 else np.float64
             )
 
@@ -299,7 +319,11 @@ def init_test_case_2(self):
 
 def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCudnnChannelLastFp16(parent):
@@ -308,7 +332,11 @@ def init_kernel_type(self):
             self.dtype = np.float16
 
         def test_check_output(self):
-            if core.is_compiled_with_cuda() or is_custom_device():
+            if (
+                core.is_compiled_with_cuda()
+                or core.is_compiled_with_rocm()
+                or is_custom_device()
+            ):
                 place = get_device_place()
                 if core.is_float16_supported(place):
                     self.check_output_with_place(place, atol=2e-2)
@@ -363,7 +391,11 @@ def init_paddings(self):
 
 def create_test_cudnn_padding_SAME_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCUDNNPaddingSAMECase(parent):
@@ -371,7 +403,11 @@ def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = (
                 np.float32
-                if (core.is_compiled_with_rocm() or is_custom_device())
+                if (
+                    core.is_compiled_with_cuda()
+                    or core.is_compiled_with_rocm()
+                    or is_custom_device()
+                )
                 else np.float64
             )
 
@@ -386,7 +422,11 @@ def init_paddings(self):
 
 def create_test_cudnn_padding_VALID_class(parent):
     @unittest.skipIf(
-        not (core.is_compiled_with_cuda() or is_custom_device()),
+        not (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_rocm()
+            or is_custom_device()
+        ),
         "core is not compiled with CUDA",
     )
     class TestCUDNNPaddingVALIDCase(parent):
@@ -394,7 +434,11 @@ def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = (
                 np.float32
-                if (core.is_compiled_with_rocm() or is_custom_device())
+                if (
+                    core.is_compiled_with_cuda()
+                    or core.is_compiled_with_rocm()
+                    or is_custom_device()
+                )
                 else np.float64
             )