From 675ce17d43fe77891c09b1df6b585647ca252e9a Mon Sep 17 00:00:00 2001 From: Rahul Salagame Date: Fri, 8 May 2026 16:13:36 -0700 Subject: [PATCH 1/3] [performance] feat: add 405B B200/B300 V2 aliases + re-export them in configs.llama Signed-off-by: Rahul Salagame --- scripts/performance/configs/llama/__init__.py | 16 ++++++++++++++++ .../llama/llama31_workload_base_configs.py | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 89548627ef..dc7949092f 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -138,13 +138,21 @@ ) from .llama31_workload_base_configs import ( LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2, LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V1, LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V1, @@ -200,6 +208,14 @@ "LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2", + 
"LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_VR200_BF16_V2", diff --git a/scripts/performance/configs/llama/llama31_workload_base_configs.py b/scripts/performance/configs/llama/llama31_workload_base_configs.py index b3cc7a6422..7e62c3d694 100644 --- a/scripts/performance/configs/llama/llama31_workload_base_configs.py +++ b/scripts/performance/configs/llama/llama31_workload_base_configs.py @@ -293,6 +293,17 @@ LLAMA31_405B_PRETRAIN_CONFIG_VR200_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_BF16_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_CS_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_MX_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V2 + +LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_MX_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2 + + LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2 = replace( LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V1, global_batch_size=1536, @@ -334,6 +345,14 @@ "LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2", + 
"LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_VR200_BF16_V2", From 28cd9707eddf091743f5f80a1fa3bcac2b9f18b3 Mon Sep 17 00:00:00 2001 From: Rahul Salagame Date: Fri, 8 May 2026 16:19:11 -0700 Subject: [PATCH 2/3] gb200 405b nvfp4 256x expandable segments: True resolves memory constraints on GB200. Performance remains the same. Signed-off-by: Rahul Salagame --- scripts/performance/perf_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 7e3e53f5ff..58a4eac849 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -289,7 +289,7 @@ def _set_model_specific_environment_variables( and train_task == "pretrain" and gpu in ["gb200", "gb300"] ): - if compute_dtype in ["fp8_cs", "fp8_mx"]: + if compute_dtype in ["fp8_cs", "fp8_mx", ""nvfp4"]: executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" elif ( From 1aec6cdb3e6a46587d435aff12d11ce72836e650 Mon Sep 17 00:00:00 2001 From: Rahul Salagame Date: Sun, 10 May 2026 20:09:03 -0700 Subject: [PATCH 3/3] resolve extra quote before nvfp4 Signed-off-by: Rahul Salagame --- scripts/performance/perf_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 58a4eac849..89f2133b88 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -289,7 +289,7 @@ def _set_model_specific_environment_variables( and train_task == "pretrain" and gpu in ["gb200", "gb300"] ): - if compute_dtype in ["fp8_cs", "fp8_mx", ""nvfp4"]: + if compute_dtype in ["fp8_cs", 
"fp8_mx", "nvfp4"]: executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" elif (