From 2e3874844445bd45abc3a663a26e54aae420dd30 Mon Sep 17 00:00:00 2001 From: "Heyuan.Hu-M4" Date: Mon, 13 Apr 2026 16:34:27 +0800 Subject: [PATCH] add bf16 support for paddleocr-vl --- paddle/phi/backends/gpu/rocm/miopen_desc.h | 3 + paddle/phi/kernels/gpu/layer_norm_kernel.cu | 9 ++- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 6 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 9 ++- test/legacy_test/test_conv2d_op.py | 66 +++++++++++++++---- 5 files changed, 76 insertions(+), 17 deletions(-) diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h index 15276c61ef8ddb..b2119d16c4a88f 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_desc.h +++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h @@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) { case DataType::FLOAT32: type = miopenFloat; break; + case DataType::BFLOAT16: + type = miopenBFloat16; + break; default: break; } diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 1e0c2c92a3766c..447362c6b6f95a 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -786,8 +786,13 @@ template PADDLE_API void LayerNormKernel( #ifdef PADDLE_WITH_HIP // MIOPEN do not support double -PD_REGISTER_KERNEL( - layer_norm, GPU, ALL_LAYOUT, phi::LayerNormKernel, float, phi::float16) { +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + phi::float16, + phi::bfloat16) { kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index ac390524b19a45..385726e413d34b 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -1443,7 +1443,8 @@ PD_REGISTER_KERNEL(conv2d_grad, ALL_LAYOUT, phi::ConvCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_grad, GPUDNN, @@ -1456,7 +1457,8 @@ PD_REGISTER_KERNEL(conv2d_double_grad, ALL_LAYOUT, phi::ConvCudnnGradGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index fcc1a2fff7029a..1c05fc884a17a1 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -561,8 +561,13 @@ void Conv3DCudnnKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL( - conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL( conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {} diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py index 170757dd57b5fe..320c7a9f968123 100644 --- a/test/legacy_test/test_conv2d_op.py +++ b/test/legacy_test/test_conv2d_op.py @@ -152,7 +152,11 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): def create_test_cudnn_class(parent): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()), + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ), "core is not compiled with CUDA", ) class TestCUDNNCase(parent): @@ -171,7 +175,11 @@ def init_kernel_type(self): def create_test_cudnn_fp16_class(parent, grad_check=True): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()), + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ), "core is not compiled with CUDA", ) class TestConv2DCUDNNFp16(parent): @@ -206,7 +214,11 @@ def test_check_grad_no_input(self): def create_test_cudnn_bf16_class(parent): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()) + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ) or not core.is_bfloat16_supported(get_device_place()), "core is not compiled with CUDA and do not support bfloat16", ) @@ -273,7 +285,11 @@ def init_test_case_2(self): def create_test_cudnn_channel_last_class(parent): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()), + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ), "core is not compiled with CUDA", ) class TestCudnnChannelLastCase(parent): @@ -281,7 +297,11 @@ def init_kernel_type(self): self.use_cudnn = True self.dtype = ( np.float32 - if (core.is_compiled_with_rocm() or is_custom_device()) + if ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ) else np.float64 ) @@ -299,7 +319,11 @@ def init_test_case_2(self): def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()), + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ), "core is not compiled with CUDA", ) class TestCudnnChannelLastFp16(parent): @@ -308,7 +332,11 @@ def init_kernel_type(self): self.dtype = np.float16 def test_check_output(self): - if core.is_compiled_with_cuda() or is_custom_device(): + if ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ): place = get_device_place() if core.is_float16_supported(place): self.check_output_with_place(place, atol=2e-2) @@ -363,7 +391,11 @@ def init_paddings(self): def create_test_cudnn_padding_SAME_class(parent): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()), + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ), "core is not compiled with CUDA", ) class TestCUDNNPaddingSAMECase(parent): @@ -371,7 +403,11 @@ def init_kernel_type(self): self.use_cudnn = True self.dtype = ( np.float32 - if (core.is_compiled_with_rocm() or is_custom_device()) + if ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ) else np.float64 ) @@ -386,7 +422,11 @@ def init_paddings(self): def create_test_cudnn_padding_VALID_class(parent): @unittest.skipIf( - not (core.is_compiled_with_cuda() or is_custom_device()), + not ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ), "core is not compiled with CUDA", ) class TestCUDNNPaddingVALIDCase(parent): @@ -394,7 +434,11 @@ def init_kernel_type(self): self.use_cudnn = True self.dtype = ( np.float32 - if (core.is_compiled_with_rocm() or is_custom_device()) + if ( + core.is_compiled_with_cuda() + or core.is_compiled_with_rocm() + or is_custom_device() + ) else np.float64 )