From 2612caa89626b1b12254dab3b56a1ff1fcf9f62d Mon Sep 17 00:00:00 2001
From: Parth Mannan <pmannan@nvidia.com>
Date: Tue, 13 Jan 2026 16:58:45 -0800
Subject: [PATCH 1/6] Update perf reproduce instructions

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
---
 examples/megatron/recipes/wan/README_perf_test.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md
index d95a4c14..8a8b20e6 100644
--- a/examples/megatron/recipes/wan/README_perf_test.md
+++ b/examples/megatron/recipes/wan/README_perf_test.md
@@ -5,7 +5,7 @@ This guide provides concise steps to set up the environment and run WAN pretrain
 ## Container Launch
 
 ```bash
-CONT="nvcr.io/nvidia/nemo:25.09.00"
+CONT="nvcr.io/nvidia/nemo:25.11"
 MOUNT="/lustre/fsw/:/lustre/fsw/"
 
 srun -t 02:00:00 \
@@ -28,18 +28,18 @@ cd /opt/
 
 # DFM (pinned)
 git clone --no-checkout https://github.com/NVIDIA-NeMo/DFM.git
-git -C DFM checkout 174bb7b34de002ebbbcae1ba8e2b12363c7dee01
+git -C DFM checkout 9eaace14995a724c982fe53726a909be2edc93cb
 export DFM_PATH=/opt/DFM
 
 # Megatron-Bridge (pinned)
 rm -rf /opt/Megatron-Bridge
-git clone --no-checkout https://github.com/huvunvidia/Megatron-Bridge.git
-git -C Megatron-Bridge checkout 713ab548e4bfee307eb94a7bb3f57c17dbb31b50
+git clone --no-checkout https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
+git -C Megatron-Bridge checkout 953aabf75c0500180dc14a6a76cf9e7e7c4baec7
 
 # Megatron-LM (pinned)
 rm -rf /opt/Megatron-LM
 git clone --no-checkout https://github.com/NVIDIA/Megatron-LM.git
-git -C Megatron-LM checkout ce8185cbbe04f38beb74360e878450f2e8525885
+git -C Megatron-LM checkout 2d398b42fd4237fffb553109563d73ac099751c3
 
 # Python path
 export PYTHONPATH="${DFM_PATH}/.:/opt/Megatron-Bridge/.:/opt/Megatron-LM"
@@ -143,6 +143,11 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret
 - Using `--mock` argument.
 - Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = F * H * W * number_packed_samples`.
 
+### Reproducing performance recipes
+
+- Please use the appropriate system config recipe in `examples/megatron/recipes/wan/conf/<h100/gb200/gb300>_perf_pretrain_mock.yaml`
+- Usage example `examples/megatron/recipes/wan/pretrain_wan.py --mock --training-mode pretrain --config-file examples/megatron/recipes/wan/conf/gb300_perf_pretrain_mock.yaml`
+
 ## Inference
 
 ```bash

From 87eb81723eca8fee37450d29f36c832c624db2f2 Mon Sep 17 00:00:00 2001
From: Parth Mannan <pmannan@nvidia.com>
Date: Wed, 14 Jan 2026 09:46:32 -0800
Subject: [PATCH 2/6] Docs update

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
---
 docs/performance-summary.md                       | 2 +-
 examples/megatron/recipes/wan/README_perf_test.md | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/performance-summary.md b/docs/performance-summary.md
index 3876e485..e150c0dc 100644
--- a/docs/performance-summary.md
+++ b/docs/performance-summary.md
@@ -53,7 +53,7 @@ The performance data includes:
 
 | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU |
 |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------|
-|Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,022.26|
+|Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,030.67|
 
 #### System: DGX-H100
 
diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md
index 8a8b20e6..59d28dd6 100644
--- a/examples/megatron/recipes/wan/README_perf_test.md
+++ b/examples/megatron/recipes/wan/README_perf_test.md
@@ -147,6 +147,7 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret
 
 - Please use the appropriate system config recipe in `examples/megatron/recipes/wan/conf/<h100/gb200/gb300>_perf_pretrain_mock.yaml`
 - Usage example `examples/megatron/recipes/wan/pretrain_wan.py --mock --training-mode pretrain --config-file examples/megatron/recipes/wan/conf/gb300_perf_pretrain_mock.yaml`
+- Note that the FLOPs calculation for Wan 2.1 is not currently supported in Megatron-Bridge. Please use a manual calculator until a fix is made.
 
 ## Inference
 

From 7290d774558743dbaf525a07373ec6212f848f1b Mon Sep 17 00:00:00 2001
From: Parth Mannan <pmannan@nvidia.com>
Date: Mon, 9 Mar 2026 15:17:46 -0700
Subject: [PATCH 3/6] Adding B200 Perf recipe and numbers for 25.11

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
---
 docs/performance-summary.md                   |  8 +++-
 .../wan/conf/b200_perf_pretrain_mock.yaml     | 43 +++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml

diff --git a/docs/performance-summary.md b/docs/performance-summary.md
index e150c0dc..fa5d6d82 100644
--- a/docs/performance-summary.md
+++ b/docs/performance-summary.md
@@ -46,7 +46,7 @@ The performance data includes:
 
 | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU |
 |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------|
-|Wan 2.1 14B|32|64|1|37440|0|1|0|1|4|0|0|787.59|
+|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|899.62|
 
 
 #### System: DGX-GB300
@@ -55,6 +55,12 @@ The performance data includes:
 |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------|
 |Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,030.67|
 
+#### System: DGX-B200
+
+| Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU |
+|-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------|
+|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|813.43|
+
 #### System: DGX-H100
 
 | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU |
diff --git a/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml b/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml
new file mode 100644
index 00000000..13fc0c8d
--- /dev/null
+++ b/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml
@@ -0,0 +1,43 @@
+model:
+  tensor_model_parallel_size: 1
+  sequence_parallel: false
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 2
+  crossattn_emb_size: 5120
+  hidden_size: 5120
+  ffn_hidden_size: 13824
+  num_attention_heads: 40
+  num_layers: 40
+  qkv_format: thd
+  seq_length: 2048 # This is not used
+
+train:
+  global_batch_size: 64
+  micro_batch_size: 1
+  eval_iters: 0
+
+scheduler:
+  lr_decay_style: constant
+  lr_warmup_iters: 0
+
+optimizer:
+  lr: 5e-6
+  min_lr: 5e-6
+
+dataset:
+  seq_length: 2048 # This is not used
+  global_batch_size: 64
+  micro_batch_size: 1
+
+logger:
+  log_interval: 1
+
+ddp:
+  use_megatron_fsdp: true
+  data_parallel_sharding_strategy: "optim_grads_params"
+
+dist:
+  use_megatron_fsdp: true
+
+checkpoint:
+  ckpt_format: "fsdp_dtensor"

From da112979ec8b192e286f091ad1a1dd638e43b575 Mon Sep 17 00:00:00 2001
From: Parth Mannan <pmannan@nvidia.com>
Date: Mon, 9 Mar 2026 15:19:22 -0700
Subject: [PATCH 4/6] Update seq len formula in README

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
---
 examples/megatron/recipes/wan/README_perf_test.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md
index 59d28dd6..d101ca33 100644
--- a/examples/megatron/recipes/wan/README_perf_test.md
+++ b/examples/megatron/recipes/wan/README_perf_test.md
@@ -141,7 +141,7 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret
 ### Using mock data (optional, for debugging)
 
 - Using `--mock` argument.
-- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = F * H * W * number_packed_samples`.
+- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = (F_latents // patch_temporal) * (H_latents // patch_spatial) * (W_latents // patch_spatial) * number_packed_samples`.
 
 ### Reproducing performance recipes
 

From 6375ddb43f4f75e56ee0c8753ecb07da3cbfb9c9 Mon Sep 17 00:00:00 2001
From: Parth Mannan <pmannan@nvidia.com>
Date: Mon, 9 Mar 2026 15:43:36 -0700
Subject: [PATCH 5/6] Update GB200 recipe with FSDP

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
---
 .../recipes/wan/conf/gb200_perf_pretrain_mock.yaml   | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml b/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml
index 7b170d36..13fc0c8d 100644
--- a/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml
+++ b/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml
@@ -2,7 +2,7 @@ model:
   tensor_model_parallel_size: 1
   sequence_parallel: false
   pipeline_model_parallel_size: 1
-  context_parallel_size: 4
+  context_parallel_size: 2
   crossattn_emb_size: 5120
   hidden_size: 5120
   ffn_hidden_size: 13824
@@ -31,3 +31,13 @@ dataset:
 
 logger:
   log_interval: 1
+
+ddp:
+  use_megatron_fsdp: true
+  data_parallel_sharding_strategy: "optim_grads_params"
+
+dist:
+  use_megatron_fsdp: true
+
+checkpoint:
+  ckpt_format: "fsdp_dtensor"

From 903eb4457c82389ac662894cd7b14f587bc47cee Mon Sep 17 00:00:00 2001
From: Parth Mannan <pmannan@nvidia.com>
Date: Tue, 10 Mar 2026 11:25:26 -0700
Subject: [PATCH 6/6] Update DGX-B200 perf number in performance summary

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
---
 docs/performance-summary.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/performance-summary.md b/docs/performance-summary.md
index fa5d6d82..791eb7b9 100644
--- a/docs/performance-summary.md
+++ b/docs/performance-summary.md
@@ -59,7 +59,7 @@ The performance data includes:
 
 | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU |
 |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------|
-|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|813.43|
+|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|804.02|
 
 #### System: DGX-H100