diff --git a/scripts/performance/configs/llama/__init__.py b/scripts/performance/configs/llama/__init__.py index 89548627ef..dc7949092f 100644 --- a/scripts/performance/configs/llama/__init__.py +++ b/scripts/performance/configs/llama/__init__.py @@ -138,13 +138,21 @@ ) from .llama31_workload_base_configs import ( LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2, LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2, LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V1, + LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2, LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V1, LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V2, LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V1, @@ -200,6 +208,14 @@ "LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_VR200_BF16_V2", diff --git a/scripts/performance/configs/llama/llama31_workload_base_configs.py b/scripts/performance/configs/llama/llama31_workload_base_configs.py index b3cc7a6422..7e62c3d694 100644 --- a/scripts/performance/configs/llama/llama31_workload_base_configs.py +++ b/scripts/performance/configs/llama/llama31_workload_base_configs.py @@ -293,6 +293,17 @@ LLAMA31_405B_PRETRAIN_CONFIG_VR200_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_BF16_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_CS_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_FP8_MX_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V2 + +LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_BF16_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_MX_V2 +LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2 = LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2 + + LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2 = replace( LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V1, global_batch_size=1536, @@ -334,6 +345,14 @@ "LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", "LLAMA31_405B_PRETRAIN_CONFIG_GB200_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B300_NVFP4_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_BF16_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_CS_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "LLAMA31_405B_PRETRAIN_CONFIG_B200_NVFP4_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_BF16_V2", "LLAMA31_405B_PRETRAIN_CONFIG_H100_FP8_CS_V2", "LLAMA31_405B_PRETRAIN_CONFIG_VR200_BF16_V2", diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 7e3e53f5ff..89f2133b88 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -289,7 +289,7 @@ def _set_model_specific_environment_variables( and train_task == "pretrain" and gpu in ["gb200", "gb300"] ): - if compute_dtype in ["fp8_cs", "fp8_mx"]: + if compute_dtype in ["fp8_cs", "fp8_mx", "nvfp4"]: executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" elif (