From 06e76586acb71b5f17940e1e1cafd0d44ee03a3c Mon Sep 17 00:00:00 2001 From: sreerohi Date: Wed, 20 May 2026 23:27:01 -0500 Subject: [PATCH] ci: disable gradient accumulation fusion for GLM-4-9B on ROCm TE fused wgrad GEMM fails with 'Unable to find any suitable algorithms' on MI350. Add --no-gradient-accumulation-fusion gated on ROCm. --- tests/e2e/megatron/test_quick_start_glm4_9B.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/e2e/megatron/test_quick_start_glm4_9B.py b/tests/e2e/megatron/test_quick_start_glm4_9B.py index b9d67b0e71..f8c031661f 100644 --- a/tests/e2e/megatron/test_quick_start_glm4_9B.py +++ b/tests/e2e/megatron/test_quick_start_glm4_9B.py @@ -1,5 +1,7 @@ import os +import torch + from tests.ci.ci_register import register_cuda_ci import miles.utils.external_utils.command_utils as U @@ -8,6 +10,7 @@ ENABLE_EVAL = U.get_bool_env_var("MILES_TEST_ENABLE_EVAL", "1") TIGHT_DEVICE_MEMORY = U.get_bool_env_var("MILES_TEST_TIGHT_DEVICE_MEMORY", "1") +IS_ROCM = hasattr(torch.version, "hip") and torch.version.hip is not None MODEL_NAME = "GLM-Z1-9B-0414" MODEL_TYPE = "glm4-9B" @@ -103,6 +106,9 @@ def execute(): "--rollout-num-gpus 4 " ) + if IS_ROCM: + misc_args += "--no-gradient-accumulation-fusion " + train_args = ( f"{ckpt_args} " f"{rollout_args} "