From b5e9fd7055bc5d3bd7e438ae8e2b507cd862fe0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?=
Date: Sun, 10 May 2026 11:15:44 +0000
Subject: [PATCH] fix(perf): halve micro_batch_size on b300/gb300 MoE perf
 configs to fit HybridEP IB QP cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NVIDIA/Megatron-LM PR #4094 (commit a08e259f32) added a Python-side
guardrail in megatron/core/transformer/moe/fused_a2a.py that rejects
HybridEP dispatch when 3*num_tokens + 1 >= 65536 (the InfiniBand RDMA
QP-depth hardware limit). The 9 b300/gb300 perf tests below were
tripping it because their per-rank num_tokens landed at 32768 (the cap
is 21844). Halving micro_batch_size brings num_tokens down to 16384,
comfortably below the cap and matching the geometry the gb200 variants
already use (which still pass).

Affected presets:

- NEMOTRON_3_NANO_PRETRAIN_CONFIG_{GB300,B300}_{BF16,FP8_MX,NVFP4}_V1:
  MBS 4 -> 2
- QWEN3_30B_A3B_PRETRAIN_CONFIG_{GB300,B300}_{BF16,FP8_CS,FP8_MX}_V1:
  MBS 8 -> 4
- QWEN3_VL_30B_A3B_PRETRAIN_CONFIG_GB300_{BF16,FP8_CS,FP8_MX}:
  MBS 8 -> 4

Note: golden values for the corresponding nemo-ci tests will need to be
re-baselined once these new configs run cleanly; that's a follow-up MR
in nemo-ci, not part of this PR.

Signed-off-by: oliver könig
---
 .../configs/nemotronh/nemotron_3_workload_base_configs.py | 4 ++--
 .../configs/qwen/qwen3_workload_base_configs.py           | 8 ++++----
 .../configs/qwen_vl/qwen3_vl_workload_base_configs.py     | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/performance/configs/nemotronh/nemotron_3_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_workload_base_configs.py
index 2cafda9789..18ed8d8096 100644
--- a/scripts/performance/configs/nemotronh/nemotron_3_workload_base_configs.py
+++ b/scripts/performance/configs/nemotronh/nemotron_3_workload_base_configs.py
@@ -41,7 +41,7 @@
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    micro_batch_size=4,
+    micro_batch_size=2,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
@@ -59,7 +59,7 @@
 
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    micro_batch_size=4,
+    micro_batch_size=2,
 )
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
 NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py
index 9cb42e6722..d9397fc7c9 100644
--- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py
+++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py
@@ -325,7 +325,7 @@
 QWEN3_30B_A3B_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
     BASE_QWEN3_30B_A3B_CONFIG,
     num_gpus=8,
-    micro_batch_size=8,
+    micro_batch_size=4,
     moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["moe_router", "moe_preprocess"],
@@ -335,7 +335,7 @@
 QWEN3_30B_A3B_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = replace(
     BASE_QWEN3_30B_A3B_CONFIG,
     num_gpus=8,
-    micro_batch_size=8,
+    micro_batch_size=4,
     moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["moe_router", "moe_preprocess"],
@@ -382,7 +382,7 @@
 QWEN3_30B_A3B_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
     BASE_QWEN3_30B_A3B_CONFIG,
     num_gpus=8,
-    micro_batch_size=8,
+    micro_batch_size=4,
     moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["moe_router", "moe_preprocess"],
@@ -392,7 +392,7 @@
 QWEN3_30B_A3B_PRETRAIN_CONFIG_B300_FP8_CS_V1 = replace(
     BASE_QWEN3_30B_A3B_CONFIG,
     num_gpus=8,
-    micro_batch_size=8,
+    micro_batch_size=4,
     moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["moe_router", "moe_preprocess"],
diff --git a/scripts/performance/configs/qwen_vl/qwen3_vl_workload_base_configs.py b/scripts/performance/configs/qwen_vl/qwen3_vl_workload_base_configs.py
index e81cef05f4..07e9934d9a 100644
--- a/scripts/performance/configs/qwen_vl/qwen3_vl_workload_base_configs.py
+++ b/scripts/performance/configs/qwen_vl/qwen3_vl_workload_base_configs.py
@@ -144,7 +144,7 @@
 QWEN3_VL_30B_A3B_PRETRAIN_CONFIG_GB300_BF16 = replace(
     BASE_QWEN3_VL_30B_A3B_CONFIG,
     num_gpus=8,
-    micro_batch_size=8,
+    micro_batch_size=4,
     moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["moe_router", "moe_preprocess"],
@@ -154,7 +154,7 @@
 QWEN3_VL_30B_A3B_PRETRAIN_CONFIG_GB300_FP8_CS = replace(
     BASE_QWEN3_VL_30B_A3B_CONFIG,
     num_gpus=8,
-    micro_batch_size=8,
+    micro_batch_size=4,
     moe_flex_dispatcher_backend="hybridep",
     cuda_graph_impl="transformer_engine",
     cuda_graph_scope=["moe_router", "moe_preprocess"],
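
For reviewers who want to sanity-check the numbers in the commit
message, here is a minimal standalone sketch of the cap arithmetic. It
paraphrases the guardrail condition quoted above (3*num_tokens + 1 >=
65536 is rejected) rather than reproducing the actual fused_a2a.py
code, and the names IB_QP_DEPTH_LIMIT, hybridep_dispatch_ok, and
SEQ_LENGTH are invented for this sketch; SEQ_LENGTH = 4096 is an
assumption chosen so that micro_batch_size reproduces the num_tokens
figures cited in the message.

    # Paraphrase of the HybridEP QP-cap check described in the commit
    # message; not the upstream Megatron-LM implementation.
    IB_QP_DEPTH_LIMIT = 65536  # InfiniBand RDMA QP-depth hardware limit

    def hybridep_dispatch_ok(num_tokens: int) -> bool:
        """True iff 3*num_tokens + 1 stays below the QP-depth limit."""
        return 3 * num_tokens + 1 < IB_QP_DEPTH_LIMIT

    # Largest per-rank num_tokens that still passes: 3n + 1 <= 65535.
    max_tokens = (IB_QP_DEPTH_LIMIT - 2) // 3
    assert max_tokens == 21844  # the "cap is 21844" from the message

    # Assumed geometry: num_tokens = micro_batch_size * seq_length,
    # with seq_length = 4096 (assumption that matches the quoted values).
    SEQ_LENGTH = 4096
    assert not hybridep_dispatch_ok(8 * SEQ_LENGTH)  # MBS 8 -> 32768, rejected
    assert hybridep_dispatch_ok(4 * SEQ_LENGTH)      # MBS 4 -> 16384, passes

Under those assumptions the halving is exactly what moves each preset
from one side of the cap to the other, which is why MBS 4 -> 2 and
MBS 8 -> 4 both land on 16384 tokens per rank.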