From 2612caa89626b1b12254dab3b56a1ff1fcf9f62d Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Tue, 13 Jan 2026 16:58:45 -0800 Subject: [PATCH 1/6] Update perf reproduce instructions Signed-off-by: Parth Mannan --- examples/megatron/recipes/wan/README_perf_test.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md index d95a4c14..8a8b20e6 100644 --- a/examples/megatron/recipes/wan/README_perf_test.md +++ b/examples/megatron/recipes/wan/README_perf_test.md @@ -5,7 +5,7 @@ This guide provides concise steps to set up the environment and run WAN pretrain ## Container Launch ```bash -CONT="nvcr.io/nvidia/nemo:25.09.00" +CONT="nvcr.io/nvidia/nemo:25.11" MOUNT="/lustre/fsw/:/lustre/fsw/" srun -t 02:00:00 \ @@ -28,18 +28,18 @@ cd /opt/ # DFM (pinned) git clone --no-checkout https://github.com/NVIDIA-NeMo/DFM.git -git -C DFM checkout 174bb7b34de002ebbbcae1ba8e2b12363c7dee01 +git -C DFM checkout 9eaace14995a724c982fe53726a909be2edc93cb export DFM_PATH=/opt/DFM # Megatron-Bridge (pinned) rm -rf /opt/Megatron-Bridge -git clone --no-checkout https://github.com/huvunvidia/Megatron-Bridge.git -git -C Megatron-Bridge checkout 713ab548e4bfee307eb94a7bb3f57c17dbb31b50 +git clone --no-checkout https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +git -C Megatron-Bridge checkout 953aabf75c0500180dc14a6a76cf9e7e7c4baec7 # Megatron-LM (pinned) rm -rf /opt/Megatron-LM git clone --no-checkout https://github.com/NVIDIA/Megatron-LM.git -git -C Megatron-LM checkout ce8185cbbe04f38beb74360e878450f2e8525885 +git -C Megatron-LM checkout 2d398b42fd4237fffb553109563d73ac099751c3 # Python path export PYTHONPATH="${DFM_PATH}/.:/opt/Megatron-Bridge/.:/opt/Megatron-LM" @@ -143,6 +143,11 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret - Using `--mock` argument. 
- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = F * H * W * number_packed_samples`. +### Reproducing performance recipes + +- Please use the appropriate system config recipe in `examples/megatron/recipes/wan/conf/<system>_perf_pretrain_mock.yaml` +- Usage example `examples/megatron/recipes/wan/pretrain_wan.py --mock --training-mode pretrain --config-file examples/megatron/recipes/wan/conf/gb300_perf_pretrain_mock.yaml` + ## Inference ```bash From 87eb81723eca8fee37450d29f36c832c624db2f2 Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Wed, 14 Jan 2026 09:46:32 -0800 Subject: [PATCH 2/6] Docs update Signed-off-by: Parth Mannan --- docs/performance-summary.md | 2 +- examples/megatron/recipes/wan/README_perf_test.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/performance-summary.md b/docs/performance-summary.md index 3876e485..e150c0dc 100644 --- a/docs/performance-summary.md +++ b/docs/performance-summary.md @@ -53,7 +53,7 @@ The performance data includes: | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| -|Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,022.26| +|Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,030.67| #### System: DGX-H100 diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md index 8a8b20e6..59d28dd6 100644 --- a/examples/megatron/recipes/wan/README_perf_test.md +++ b/examples/megatron/recipes/wan/README_perf_test.md @@ -147,6 +147,7 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret - Please use the appropriate system config recipe in `examples/megatron/recipes/wan/conf/<system>_perf_pretrain_mock.yaml` - Usage example `examples/megatron/recipes/wan/pretrain_wan.py --mock --training-mode 
pretrain --config-file examples/megatron/recipes/wan/conf/gb300_perf_pretrain_mock.yaml` +- Note that the FLOPs calculation for Wan 2.1 is not currently supported in Megatron-Bridge. Please use a manual calculator until a fix is made. ## Inference From 7290d774558743dbaf525a07373ec6212f848f1b Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Mon, 9 Mar 2026 15:17:46 -0700 Subject: [PATCH 3/6] Adding B200 Perf recipe and numbers for 25.11 Signed-off-by: Parth Mannan --- docs/performance-summary.md | 8 +++- .../wan/conf/b200_perf_pretrain_mock.yaml | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml diff --git a/docs/performance-summary.md b/docs/performance-summary.md index e150c0dc..fa5d6d82 100644 --- a/docs/performance-summary.md +++ b/docs/performance-summary.md @@ -46,7 +46,7 @@ The performance data includes: | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| -|Wan 2.1 14B|32|64|1|37440|0|1|0|1|4|0|0|787.59| +|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|899.62| #### System: DGX-GB300 @@ -55,6 +55,12 @@ The performance data includes: |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| |Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,030.67| +#### System: DGX-B200 + +| Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | +|-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| +|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|813.43| + #### System: DGX-H100 | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | diff --git 
a/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml b/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml new file mode 100644 index 00000000..13fc0c8d --- /dev/null +++ b/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml @@ -0,0 +1,43 @@ +model: + tensor_model_parallel_size: 1 + sequence_parallel: false + pipeline_model_parallel_size: 1 + context_parallel_size: 2 + crossattn_emb_size: 5120 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + num_layers: 40 + qkv_format: thd + seq_length: 2048 # This is not used + +train: + global_batch_size: 64 + micro_batch_size: 1 + eval_iters: 0 + +scheduler: + lr_decay_style: constant + lr_warmup_iters: 0 + +optimizer: + lr: 5e-6 + min_lr: 5e-6 + +dataset: + seq_length: 2048 # This is not used + global_batch_size: 64 + micro_batch_size: 1 + +logger: + log_interval: 1 + +ddp: + use_megatron_fsdp: true + data_parallel_sharding_strategy: "optim_grads_params" + +dist: + use_megatron_fsdp: true + +checkpoint: + ckpt_format: "fsdp_dtensor" From da112979ec8b192e286f091ad1a1dd638e43b575 Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Mon, 9 Mar 2026 15:19:22 -0700 Subject: [PATCH 4/6] Update seq len formula in README Signed-off-by: Parth Mannan --- examples/megatron/recipes/wan/README_perf_test.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md index 59d28dd6..d101ca33 100644 --- a/examples/megatron/recipes/wan/README_perf_test.md +++ b/examples/megatron/recipes/wan/README_perf_test.md @@ -141,7 +141,7 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret ### Using mock data (optional, for debugging) - Using `--mock` argument. -- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = F * H * W * number_packed_samples`. 
+- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = (F_latents // patch_temporal) * (H_latents // patch_spatial) * (W_latents // patch_spatial) * number_packed_samples`. ### Reproducing performance recipes From 6375ddb43f4f75e56ee0c8753ecb07da3cbfb9c9 Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Mon, 9 Mar 2026 15:43:36 -0700 Subject: [PATCH 5/6] Update GB200 recipe with FSDP Signed-off-by: Parth Mannan --- .../recipes/wan/conf/gb200_perf_pretrain_mock.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml b/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml index 7b170d36..13fc0c8d 100644 --- a/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml +++ b/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml @@ -2,7 +2,7 @@ model: tensor_model_parallel_size: 1 sequence_parallel: false pipeline_model_parallel_size: 1 - context_parallel_size: 4 + context_parallel_size: 2 crossattn_emb_size: 5120 hidden_size: 5120 ffn_hidden_size: 13824 @@ -31,3 +31,13 @@ dataset: logger: log_interval: 1 + +ddp: + use_megatron_fsdp: true + data_parallel_sharding_strategy: "optim_grads_params" + +dist: + use_megatron_fsdp: true + +checkpoint: + ckpt_format: "fsdp_dtensor" From 903eb4457c82389ac662894cd7b14f587bc47cee Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Tue, 10 Mar 2026 11:25:26 -0700 Subject: [PATCH 6/6] Update Signed-off-by: Parth Mannan --- docs/performance-summary.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/performance-summary.md b/docs/performance-summary.md index fa5d6d82..791eb7b9 100644 --- a/docs/performance-summary.md +++ b/docs/performance-summary.md @@ -59,7 +59,7 @@ The performance data includes: | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | 
|-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| -|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|813.43| +|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|804.02| #### System: DGX-H100