10 changes: 10 additions & 0 deletions scripts/performance/configs/deepseek/__init__.py
@@ -67,6 +67,11 @@
    DEEPSEEK_V3_PRETRAIN_CONFIG_VR200_FP8_MX_V2,
    DEEPSEEK_V3_PRETRAIN_CONFIG_VR200_NVFP4_V1,
    DEEPSEEK_V3_PRETRAIN_CONFIG_VR200_NVFP4_V2,
    # FSDP
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_FSDP,
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_FSDP,
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_FSDP,
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_FSDP,
)


@@ -125,6 +130,11 @@
    "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_LARGE_SCALE",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_LARGE_SCALE",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_LARGE_SCALE",
    # FSDP
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_FSDP",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_FSDP",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_FSDP",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_FSDP",
]

if HAVE_MEGATRON_BRIDGE:
@@ -19,6 +19,7 @@

V1: GBS=2048 for Blackwell variants, GBS=8192 for H100
V2: GBS=4096 for Blackwell variants, GBS=16384 for H100
FSDP: Megatron FSDP, no PP, GBS=256 for GB300 (64 GPUs)

Use --config_variant to select a variant.
Use --list_config_variants to see available variants interactively.
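Editor's note (not part of the diff): once these exports land, the new FSDP variants should be reachable by name, as in the sketch below. The import path and the FSDP_VARIANTS mapping are assumptions for illustration; the runner script's actual --config_variant plumbing is not shown in this PR excerpt.

# Illustrative sketch only, not part of this PR. Assumes the repo root is on
# sys.path and that the exported constants are plain config objects carrying
# the fields set by replace() in the hunk below.
from scripts.performance.configs.deepseek import (
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_FSDP,
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_FSDP,
)

# Hypothetical mapping from a --config_variant string to a config object.
FSDP_VARIANTS = {
    "BF16_FSDP": DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_FSDP,
    "FP8_MX_FSDP": DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_FSDP,
}

cfg = FSDP_VARIANTS["FP8_MX_FSDP"]
print(cfg.num_gpus, cfg.global_batch_size, cfg.use_megatron_fsdp)  # 64 256 True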
@@ -259,6 +260,37 @@
)


# =============================================================================
# DeepSeek V3 Pretrain - FSDP (FSDP, no PP, GBS=256 for GB300)
# =============================================================================

DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FSDP = replace(
    BASE_DEEPSEEK_V3_CONFIG,
    num_gpus=64,
    global_batch_size=256,
    micro_batch_size=2,
    pipeline_model_parallel_size=1,
    expert_model_parallel_size=64,
    use_megatron_fsdp=True,
    moe_flex_dispatcher_backend="hybridep",
    moe_a2a_overlap=False,
    cuda_graph_scope=[],
    recompute_modules=["layernorm", "mla_up_proj", "moe_act"],
    fine_grained_activation_offloading=True,
    offload_modules=["core_attn", "attn_proj"],
    fp8_param_gather=True,
    reuse_grad_buf_for_mxfp8_param_ag=True,
)
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_FSDP = replace(
    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FSDP,
    fp8_param_gather=None,
    reuse_grad_buf_for_mxfp8_param_ag=None,
)
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_FSDP = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FSDP
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_FSDP = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FSDP
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_FSDP = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FSDP
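Editor's note on the block above: dataclasses.replace builds a new copy with only the named fields changed, so the BF16 variant clears the two FP8-specific knobs, while the FP8_CS/FP8_MX/NVFP4 names alias the shared base object unchanged (the numeric format itself is presumably selected elsewhere by the variant name). A stand-in sketch of that behavior, not using this repo's actual BASE_DEEPSEEK_V3_CONFIG:

# Stand-in sketch of the replace()/alias pattern used above.
from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class _Cfg:
    fp8_param_gather: Optional[bool] = None
    reuse_grad_buf_for_mxfp8_param_ag: Optional[bool] = None

_base = _Cfg(fp8_param_gather=True, reuse_grad_buf_for_mxfp8_param_ag=True)
_bf16 = replace(_base, fp8_param_gather=None, reuse_grad_buf_for_mxfp8_param_ag=None)

assert _base.fp8_param_gather is True     # base keeps the FP8 param-gather setting
assert _bf16.fp8_param_gather is None     # BF16 copy clears it (falls back to defaults)
_fp8_mx = _base                           # the FP8/NVFP4 names are aliases, not copies
assert _fp8_mx is _base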


# =============================================================================
# DeepSeek V3 Pretrain - Large Scale Proxy
# =============================================================================
@@ -344,6 +376,11 @@
    "DEEPSEEK_V3_PRETRAIN_CONFIG_VR200_FP8_CS_V2",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_VR200_FP8_MX_V2",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_VR200_NVFP4_V2",
    # FSDP (FSDP, GBS=256 for GB300)
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_FSDP",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_FSDP",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_FSDP",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_FSDP",
    # Large Scale Proxy
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_LARGE_SCALE",
    "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_LARGE_SCALE",
9 changes: 9 additions & 0 deletions scripts/performance/utils/overrides.py
@@ -235,6 +235,15 @@ def set_workload_base_configs(cfg: ConfigContainer, settings: WorkloadBaseConfig
        cfg.model.quant_recipe = load_quantization_recipe(settings.te_precision_config_file)
    _set_common_perf_overrides(cfg)

    if settings.fine_grained_activation_offloading is not None:
        cfg.model.fine_grained_activation_offloading = settings.fine_grained_activation_offloading
    if settings.offload_modules is not None:
        cfg.model.offload_modules = settings.offload_modules
    if settings.fp8_param_gather is not None:
        cfg.mixed_precision.fp8_param_gather = settings.fp8_param_gather
    if settings.reuse_grad_buf_for_mxfp8_param_ag is not None:
        cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = settings.reuse_grad_buf_for_mxfp8_param_ag

    if settings.moe_flex_dispatcher_backend is not None:
        apply_flex_dispatcher_backend(cfg.model, settings.moe_flex_dispatcher_backend)
    elif hasattr(cfg.model, "moe_token_dispatcher_type"):
8 changes: 8 additions & 0 deletions scripts/performance/utils/utils.py
@@ -57,6 +57,14 @@ class WorkloadBaseConfig:
    recompute_num_layers: Optional[int] = None
    recompute_modules: Optional[List[str]] = None

    # Fine-grained activation offloading
    fine_grained_activation_offloading: Optional[bool] = None
    offload_modules: Optional[List[str]] = None

    # FP8 parameter gather settings (used with FSDP)
    fp8_param_gather: Optional[bool] = None
    reuse_grad_buf_for_mxfp8_param_ag: Optional[bool] = None

    # MoE configuration
    moe_flex_dispatcher_backend: Optional[str] = None
    moe_a2a_overlap: Optional[bool] = False
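Editor's note tying the two hunks together: the new WorkloadBaseConfig fields default to None, and set_workload_base_configs in overrides.py only writes them through when a workload sets them explicitly, matching the file's existing None-sentinel convention. A self-contained sketch of that convention with stand-in classes (illustrative; not the repo's ConfigContainer or model config):

# Illustrative sketch of the None-sentinel override convention.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class _Settings:
    fine_grained_activation_offloading: Optional[bool] = None
    offload_modules: Optional[List[str]] = None

@dataclass
class _Model:
    fine_grained_activation_offloading: bool = False
    offload_modules: List[str] = field(default_factory=list)

def _apply(model: _Model, settings: _Settings) -> None:
    # Only explicitly-set (non-None) fields override the workload defaults.
    if settings.fine_grained_activation_offloading is not None:
        model.fine_grained_activation_offloading = settings.fine_grained_activation_offloading
    if settings.offload_modules is not None:
        model.offload_modules = settings.offload_modules

m = _Model()
_apply(m, _Settings())  # nothing set -> defaults kept
assert m.fine_grained_activation_offloading is False
_apply(m, _Settings(fine_grained_activation_offloading=True, offload_modules=["core_attn"]))
assert m.fine_grained_activation_offloading is True and m.offload_modules == ["core_attn"]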