From 1aeff19f7343f6442d010580ee8f40cb1d4c22c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?=
Date: Sun, 10 May 2026 11:01:52 +0000
Subject: [PATCH] perf(qwen3-next): set expandable_segments on GB300
 BF16/FP8_MX to fix OOM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The qwen3_next_80b_a3b GB300 BF16/FP8_MX perf config OOMs at first
forward under MBS=4 with HybridEP and TE-scoped CUDA graphs
(attn/moe_router/moe_preprocess). The pattern matches the existing H100
FP8_CS path: NCCL/HybridEP buffer allocations fragment the heap, and
expandable_segments lets the allocator reclaim physical memory without
disabling any NCCL algorithms or reducing the micro-batch size.

This preserves MBS=4 (and therefore TFLOPs) instead of falling back to
selective recompute or matching the GB200 BF16 sister config (MBS=2).

Verified hypothesis (CI run TBD): triage container nightly.50761024 +
anchor MCore + this MBridge HEAD on test
qwen3_next_80b_a3b_gb300_bf16_50steps_perf, cluster lyris.

Signed-off-by: oliver könig
---
 scripts/performance/perf_plugins.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 383c6eb1dc..45b1a0a47e 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -284,13 +284,17 @@ def _set_model_specific_environment_variables(
         model_family_name in ["qwen"]
         and model_recipe_name in ["qwen3_next_80b_a3b"]
         and train_task == "pretrain"
-        and gpu in ["h100"]
-        and compute_dtype == "fp8_cs"
+        and (
+            (gpu == "h100" and compute_dtype == "fp8_cs")
+            or (gpu == "gb300" and compute_dtype in ["bf16", "fp8_mx"])
+        )
     ):
         # NCCL 2.29.7 increases memory pressure on H100, causing allocator
         # fragmentation OOM. expandable_segments lets the allocator reclaim
         # fragmented physical memory and avoids the OOM without disabling
-        # any NCCL algorithms.
+        # any NCCL algorithms. The GB300 BF16/FP8_MX path hits the same
+        # fragmentation pattern at MBS=4 under HybridEP + TE-scoped CUDA
+        # graphs (attn/moe_router/moe_preprocess), so the same fix applies.
         executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
     if model_family_name in ["deepseek"]: