diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
index d81ef58c2eecd0..f1009eb7580fba 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
@@ -280,6 +280,14 @@ class Conv2dAddActFusePass : public pir::PatternRewritePass {
   pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
     pir::RewritePatternSet ps(context);
+#ifdef PADDLE_WITH_HIP
+    // fused_conv2d_add_act kernel is not implemented for ROCm/HIP.
+    // Returning an empty pattern set prevents the pass from generating
+    // FusedConv2dAddActOp nodes that have no kernel on ROCm, which would
+    // cause a runtime error. PaddleX used to work around this by calling
+    // config.delete_pass() on these passes; this guard makes that unnecessary.
+    return ps;
+#endif
     auto conv2d_double_add_act_fuse_pattern =
         std::make_unique(
             context,
diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
index 475eb426e1de93..dc3f380e608afe 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
@@ -180,6 +180,14 @@ class Conv2dAddFusePass : public pir::PatternRewritePass {
   pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
     pir::RewritePatternSet ps(context);
+#ifdef PADDLE_WITH_HIP
+    // fused_conv2d_add_act kernel is not implemented for ROCm/HIP.
+    // Returning an empty pattern set prevents the pass from generating
+    // FusedConv2dAddActOp nodes that have no kernel on ROCm, which would
+    // cause a runtime error. PaddleX used to work around this by calling
+    // config.delete_pass() on these passes; this guard makes that unnecessary.
+    return ps;
+#endif
     // cutlass related
     const std::unordered_set<int> cutlass_sm = {
         75,
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index de26ff4ffa92da..70cf677a618cd3 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -786,8 +786,14 @@ template PADDLE_API void LayerNormKernel(
 #ifdef PADDLE_WITH_HIP
 // MIOPEN do not support double
-PD_REGISTER_KERNEL(
-    layer_norm, GPU, ALL_LAYOUT, phi::LayerNormKernel, float, phi::float16) {
+// bfloat16 uses the generic custom HIP kernel (not MIOpen), so it works on ROCm
+PD_REGISTER_KERNEL(layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {
   kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
 }
diff --git a/test/legacy_test/test_layer_norm_bf16_hip.py b/test/legacy_test/test_layer_norm_bf16_hip.py
new file mode 100644
index 00000000000000..e7c48f3defe7ce
--- /dev/null
+++ b/test/legacy_test/test_layer_norm_bf16_hip.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test that paddle.nn.LayerNorm correctly handles bfloat16 input on HIP/ROCm.
+
+Root cause fixed:
+    paddle/phi/kernels/gpu/layer_norm_kernel.cu: HIP PD_REGISTER_KERNEL did not
+    include phi::bfloat16. This test verifies the fix is present and working.
+"""
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+
+
+def _ref_layer_norm_fp32(x_np, weight_np, bias_np, eps=1e-5):
+    """NumPy reference: layer norm over the trailing normalized dims.
+
+    The normalized axes are inferred from weight_np's rank so that a
+    multi-dim normalized_shape (e.g. (16, 16)) matches paddle.nn.LayerNorm,
+    which normalizes over the last len(normalized_shape) dimensions.
+    """
+    norm_ndim = weight_np.ndim if weight_np is not None else 1
+    axes = tuple(range(x_np.ndim - norm_ndim, x_np.ndim))
+    mean = x_np.mean(axis=axes, keepdims=True)
+    var = x_np.var(axis=axes, keepdims=True)
+    y = (x_np - mean) / np.sqrt(var + eps)
+    if weight_np is not None:
+        y = y * weight_np
+    if bias_np is not None:
+        y = y + bias_np
+    return y
+
+
+@unittest.skipUnless(
+    paddle.is_compiled_with_rocm() or paddle.is_compiled_with_cuda(),
+    "GPU test: requires CUDA or ROCm",
+)
+class TestLayerNormBF16HIP(unittest.TestCase):
+    """Verify LayerNorm kernel is registered for bfloat16 on HIP."""
+
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        np.random.seed(42)
+
+    def _run_layer_norm_bf16(self, shape, normalized_shape):
+        x_fp32 = np.random.randn(*shape).astype(np.float32)
+        weight_fp32 = np.ones(normalized_shape, dtype=np.float32)
+        bias_fp32 = np.zeros(normalized_shape, dtype=np.float32)
+
+        # Reference: compute in fp32
+        ref = _ref_layer_norm_fp32(x_fp32, weight_fp32, bias_fp32)
+
+        # BF16 path
+        x_bf16 = paddle.to_tensor(
+            x_fp32, dtype=paddle.bfloat16, place=self.place
+        )
+        ln = nn.LayerNorm(normalized_shape)
+        ln = ln.to(dtype=paddle.bfloat16)
+
+        with paddle.amp.auto_cast(dtype='bfloat16'):
+            out_bf16 = ln(x_bf16)
+
+        out_fp32 = out_bf16.cast(paddle.float32).numpy()
+
+        # BF16 has ~2 decimal digits of precision; use generous tolerance
+        np.testing.assert_allclose(
+            out_fp32,
+            ref,
+            rtol=1e-2,
+            atol=1e-2,
+            err_msg=f"LayerNorm BF16 output does not match FP32 reference "
+            f"(shape={shape}, normalized_shape={normalized_shape})",
+        )
+
+    def test_2d_last_dim(self):
+        """LayerNorm over last dim of a 2D tensor."""
+        self._run_layer_norm_bf16((4, 64), (64,))
+
+    def test_3d_last_dim(self):
+        """LayerNorm over last dim of a 3D tensor (batch, seq, hidden)."""
+        self._run_layer_norm_bf16((2, 8, 128), (128,))
+
+    def test_4d_last_two_dims(self):
+        """LayerNorm over last two dims of a 4D tensor."""
+        self._run_layer_norm_bf16((2, 4, 16, 16), (16, 16))
+
+    def test_bf16_output_dtype(self):
+        """Output dtype must be bfloat16 when input is bfloat16."""
+        x = paddle.ones([2, 32], dtype=paddle.bfloat16)
+        ln = nn.LayerNorm(32).to(dtype=paddle.bfloat16)
+        out = ln(x)
+        self.assertEqual(
+            out.dtype,
+            paddle.bfloat16,
+            "LayerNorm output dtype should be bfloat16 when input is bfloat16",
+        )
+
+    def test_kernel_registered(self):
+        """Verify that calling layer_norm with bfloat16 does not raise a
+        'kernel not registered' error (the core fix for HIP)."""
+        x = paddle.randn([4, 64]).cast(paddle.bfloat16)
+        try:
+            paddle.nn.functional.layer_norm(x, [64])
+        except Exception as e:
+            self.fail(
+                f"layer_norm raised an exception with bfloat16 input: {e}"
+            )
+
+    @unittest.skipUnless(
+        paddle.is_compiled_with_rocm(),
+        "ROCm-specific test",
+    )
+    def test_rocm_bf16_snr(self):
+        """On ROCm, verify signal-to-noise ratio >= 30 dB vs FP32 reference."""
+        np.random.seed(0)
+        x_fp32_np = np.random.randn(8, 256).astype(np.float32)
+
+        x_fp32 = paddle.to_tensor(x_fp32_np, place=self.place)
+        ln_fp32 = nn.LayerNorm(256)
+        ref_out = ln_fp32(x_fp32).numpy()
+
+        x_bf16 = x_fp32.cast(paddle.bfloat16)
+        ln_bf16 = nn.LayerNorm(256).to(dtype=paddle.bfloat16)
+        test_out = ln_bf16(x_bf16).cast(paddle.float32).numpy()
+
+        signal_power = np.mean(ref_out ** 2)
+        noise_power = np.mean((ref_out - test_out) ** 2) + 1e-30
+        snr_db = 10 * np.log10(signal_power / noise_power)
+
+        self.assertGreaterEqual(
+            snr_db,
+            30.0,
+            f"LayerNorm BF16 SNR on ROCm too low: "
+            f"{snr_db:.1f} dB (expected >= 30 dB)",
+        )
+        print(f"\n[ROCm BF16 LayerNorm SNR] {snr_db:.1f} dB — PASS")
+
+
+if __name__ == '__main__':
+    unittest.main()
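
For reference, the deployment-side workaround that the #ifdef PADDLE_WITH_HIP guards above make unnecessary looked roughly like the sketch below. It uses the paddle.inference Python API (Config.delete_pass and create_predictor, which exist in that API); the model file names are placeholders and the pass names are assumptions inferred from the two modified pass files, so treat this as illustrative rather than the exact PaddleX code.

    # Sketch of the old ROCm-only workaround (redundant once this patch lands).
    # Model paths are placeholders; pass names are assumed from the pass files.
    from paddle.inference import Config, create_predictor

    config = Config("inference.pdmodel", "inference.pdiparams")
    config.enable_use_gpu(256, 0)
    # Drop the conv2d fusion passes whose fused op has no HIP kernel.
    config.delete_pass("conv2d_add_act_fuse_pass")
    config.delete_pass("conv2d_add_fuse_pass")
    predictor = create_predictor(config)

With this patch the two passes register no patterns on ROCm builds, so creating the predictor without the delete_pass calls no longer produces FusedConv2dAddActOp nodes that lack a kernel.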