From 675ce17d43fe77891c09b1df6b585647ca252e9a Mon Sep 17 00:00:00 2001 From: Rahul Salagame Date: Fri, 8 May 2026 16:13:36 -0700 Subject: [PATCH 1/3] [performance] feat: add 405B B200/B300 V2 aliases + re-export them in configs.llama Signed-off-by: Rahul Salagame --- scripts/performance/configs/llama/__init__.py | 16 ++++++++++++++++ .../llama/llama31_workload_base_configs.py | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 89548627ef..dc7949092f 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -138,13 +138,21 @@ ) from .llama31_workload_base_configs import ( LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2, LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V1, LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V1, @@ -200,6 +208,14 @@ "LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2", + 
"LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_VR200_BF16_V2", diff --git a/scripts/performance/configs/llama/llama31_workload_base_configs.py b/scripts/performance/configs/llama/llama31_workload_base_configs.py index b3cc7a6422..7e62c3d694 100644 --- a/scripts/performance/configs/llama/llama31_workload_base_configs.py +++ b/scripts/performance/configs/llama/llama31_workload_base_configs.py @@ -293,6 +293,17 @@ LLAMA31_405B_PRETRAIN_CONFIG_VR200_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_BF16_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_CS_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_MX_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V2 + +LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_MX_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2 + + LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2 = replace( LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V1, global_batch_size=1536, @@ -334,6 +345,14 @@ "LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2", + 
"LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_VR200_BF16_V2", From 28cd9707eddf091743f5f80a1fa3bcac2b9f18b3 Mon Sep 17 00:00:00 2001 From: Rahul Salagame Date: Fri, 8 May 2026 16:19:11 -0700 Subject: [PATCH 2/3] gb200 405b nvfp4 256x expandable segments: True resolves memory constraints on GB200. Performance remains the same. Signed-off-by: Rahul Salagame --- scripts/performance/perf_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 7e3e53f5ff..58a4eac849 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -289,7 +289,7 @@ def _set_model_specific_environment_variables( and train_task == "pretrain" and gpu in ["gb200", "gb300"] ): - if compute_dtype in ["fp8_cs", "fp8_mx"]: + if compute_dtype in ["fp8_cs", "fp8_mx", ""nvfp4"]: executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" elif ( From 1aec6cdb3e6a46587d435aff12d11ce72836e650 Mon Sep 17 00:00:00 2001 From: Rahul Salagame Date: Sun, 10 May 2026 20:09:03 -0700 Subject: [PATCH 3/3] resolve extra quote before nvfp4 Signed-off-by: Rahul Salagame --- scripts/performance/perf_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 58a4eac849..89f2133b88 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -289,7 +289,7 @@ def _set_model_specific_environment_variables( and train_task == "pretrain" and gpu in ["gb200", "gb300"] ): - if compute_dtype in ["fp8_cs", "fp8_mx", ""nvfp4"]: + if compute_dtype in ["fp8_cs", 
"fp8_mx", "nvfp4"]: executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" elif (