From 60294bf8794f2e350422943549883de8ae962b73 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Tue, 5 May 2026 10:05:14 -0700 Subject: [PATCH 01/12] workable code --- .../qwen_vl/data/energon/task_encoder.py | 68 ++++++++++++++++++- .../bridge/recipes/qwen_vl/qwen3_vl.py | 36 ++++++++-- .../bridge/training/utils/visual_inputs.py | 19 +++++- .../recipes/qwen_vl/test_qwen3_vl_recipes.py | 4 ++ 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py index 50178a0749..ef33dc758b 100644 --- a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py +++ b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py @@ -21,7 +21,7 @@ import numpy as np import torch -from megatron.energon import Batch, DefaultTaskEncoder +from megatron.energon import Batch, DefaultTaskEncoder, SkipSample from transformers import BatchEncoding from megatron.bridge.data.energon.task_encoder_utils import ( @@ -39,6 +39,7 @@ from megatron.bridge.training.utils.visual_inputs import Qwen2_5_VLVisualInputs + def process_vision( processor, images, videos, fps=None, model_version: str = "qwen-vl", min_pixels=None, max_pixels=None ): @@ -56,7 +57,12 @@ def process_vision( image_grid_thw = None if videos is not None: - videos_inputs = processor(images=None, text="", videos=videos, return_tensors="pt") + # DEBUGGING + # videos_inputs = processor(images=None, text="", videos=videos, return_tensors="pt") + # Pre-decoded frames from WDS are already at the desired sampling rate. + # do_sample_frames=False prevents the processor from re-sampling them under + # a spurious 24 fps assumption, which would reduce most clips to T=2. + videos_inputs = processor.video_processor(videos=videos, return_tensors="pt", do_sample_frames=False) video_grid_thw = videos_inputs.get("video_grid_thw", None) else: videos_inputs = {} @@ -168,6 +174,9 @@ def __init__( max_padding_length: int = 4096, min_pixels: int = 200704, max_pixels: int = 1003520, + max_num_images: int | None = 10, + max_num_frames: int | None = 60, + max_visual_tokens: int | None = 16384, ): super().__init__() @@ -176,6 +185,9 @@ def __init__( self.seq_length = max_padding_length self.min_pixels = min_pixels self.max_pixels = max_pixels + self.max_num_images = max_num_images + self.max_num_frames = max_num_frames + self.max_visual_tokens = max_visual_tokens self.temporal_patch_size = temporal_patch_size self.merge_size = spatial_merge_size @@ -202,6 +214,34 @@ def encode_sample(self, sample: ChatMLSample): videos_for_processing = ( _videos_to_pil(sample.videos) if sample.videos is not None and len(sample.videos) > 0 else None ) + + if self.max_num_images is not None and imgs_for_processing is not None: + if len(imgs_for_processing) > self.max_num_images: + logging.warning( + "Skipping sample %s: %d images exceeds max_num_images=%d", + sample.__key__, + len(imgs_for_processing), + self.max_num_images, + ) + print(f"[DEBUG] (task_encoder.py) Skipping sample {sample.__key__} because it has {len(imgs_for_processing)} images, which exceeds max_num_images={self.max_num_images}") + raise SkipSample() + + if self.max_num_frames is not None and videos_for_processing is not None: + clipped = [] + for v in videos_for_processing: + if len(v) > self.max_num_frames: + logging.warning( + "Truncating %d frames to max_num_frames=%d for sample %s", + len(v), + self.max_num_frames, + sample.__key__, + ) + print(f"[DEBUG] (task_encoder.py) Truncating 
{len(v)} frames to max_num_frames={self.max_num_frames} for sample {sample.__key__}") + clipped.append(v[: self.max_num_frames]) + else: + clipped.append(v) + videos_for_processing = clipped + processed_vision = process_vision( self.image_processor, imgs_for_processing, @@ -214,6 +254,21 @@ def encode_sample(self, sample: ChatMLSample): flattened_imgs = processed_vision["image_inputs"] flattened_videos = processed_vision["video_inputs"] + merge_length = self.merge_size**2 + image_tokens = int(image_thw_grids.prod(dim=-1).sum().item()) // merge_length if image_thw_grids is not None else 0 + video_tokens = int(video_thw_grids.prod(dim=-1).sum().item()) // merge_length if video_thw_grids is not None else 0 + total_visual_tokens = image_tokens + video_tokens + if self.max_visual_tokens is not None: + if total_visual_tokens > self.max_visual_tokens: + logging.warning( + "Skipping sample %s: %d visual tokens exceeds max_visual_tokens=%d", + sample.__key__, + total_visual_tokens, + self.max_visual_tokens, + ) + print(f"[DEBUG] (task_encoder.py) Skipping sample {sample.__key__} because it has {total_visual_tokens} visual tokens, which exceeds max_visual_tokens={self.max_visual_tokens}") + raise SkipSample() + # Normalize conversation to [{"role": ..., "content": ...}, ...] conversation = cook_chatml_sample(sample.conversation) @@ -287,7 +342,12 @@ def encode_sample(self, sample: ChatMLSample): target_length = input_ids.shape[0] if target_length > self.seq_len: - logging.warning(f"Long sequence with length {target_length} found, dropped...") + if total_visual_tokens > self.seq_len: + logging.warning( + f"Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample." + ) + print(f"[DEBUG] (task_encoder.py) Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample.") + # raise SkipSample() final_input_ids = np.zeros(target_length, dtype=input_ids.dtype) final_input_masks = final_input_ids.copy() @@ -435,7 +495,9 @@ def encode_batch(self, batch: QwenVLTaskBatch) -> dict: raw["visual_inputs"] = Qwen2_5_VLVisualInputs( pixel_values=batch.pixel_values, + pixel_values_videos=batch.pixel_values_videos, image_grid_thw=batch.image_grid_thw, + video_grid_thw=batch.video_grid_thw, ) return raw diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py index 5f84009620..72e631e657 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py @@ -20,6 +20,7 @@ from __future__ import annotations import os +from dataclasses import dataclass from typing import Optional, Union import torch @@ -28,6 +29,7 @@ from megatron.bridge import AutoBridge from megatron.bridge.data.energon.energon_provider import EnergonProvider +from megatron.bridge.data.utils import DatasetBuildContext from megatron.bridge.data.vlm_datasets import MockVLMConversationProvider from megatron.bridge.peft.base import PEFT from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm @@ -265,10 +267,36 @@ def qwen3_vl_235b_a22b_pretrain_mock_config(**user_kwargs: Unpack[Qwen3VLCommonK return _qwen3_vl_common(**combined_kwargs) +@dataclass(kw_only=True) +class QwenVLEnergonProvider(EnergonProvider): + """EnergonProvider subclass that exposes task-encoder knobs as CLI-overridable fields. 
+ + The task encoder is constructed eagerly (same as before), but build_datasets + syncs these fields onto it after CLI overrides have been applied. + """ + + min_pixels: int = 200704 + max_pixels: int = 1003520 + max_num_images: int | None = 10 + max_num_frames: int | None = 60 + max_visual_tokens: int | None = 16384 + + def build_datasets(self, context: DatasetBuildContext): + if self.task_encoder is not None: + self.task_encoder.seq_len = self.seq_length + self.task_encoder.seq_length = self.seq_length + self.task_encoder.min_pixels = self.min_pixels + self.task_encoder.max_pixels = self.max_pixels + self.task_encoder.max_num_images = self.max_num_images + self.task_encoder.max_num_frames = self.max_num_frames + self.task_encoder.max_visual_tokens = self.max_visual_tokens + return super().build_datasets(context) + + def _make_energon_dataset( hf_path: str, seq_length: int, micro_batch_size: int, global_batch_size: int -) -> EnergonProvider: - """Create an EnergonProvider dataset config for Qwen3-VL recipes.""" +) -> QwenVLEnergonProvider: + """Create a QwenVLEnergonProvider dataset config for Qwen3-VL recipes.""" tokenizer = AutoTokenizer.from_pretrained(hf_path) # Use Qwen3VLProcessor to match the HF flow (which uses AutoProcessor). # This processor accepts both images and videos kwargs. @@ -278,7 +306,7 @@ def _make_energon_dataset( image_processor=image_processor, max_padding_length=seq_length, ) - return EnergonProvider( + return QwenVLEnergonProvider( path="", # Must be set via CLI override: dataset.path= seq_length=seq_length, micro_batch_size=micro_batch_size, @@ -1140,5 +1168,5 @@ def qwen3_vl_8b_peft_energon_config(peft_scheme: str | PEFT = "lora") -> ConfigC """ cfg = qwen3_vl_8b_peft_config(peft_scheme=peft_scheme) hf_path = "Qwen/Qwen3-VL-8B-Instruct" - cfg.dataset = _make_energon_dataset(hf_path, 4096, cfg.train.micro_batch_size, cfg.train.global_batch_size) + cfg.dataset = _make_energon_dataset(hf_path, cfg.model.seq_length, cfg.train.micro_batch_size, cfg.train.global_batch_size) return cfg diff --git a/src/megatron/bridge/training/utils/visual_inputs.py b/src/megatron/bridge/training/utils/visual_inputs.py index 638dc86d66..bc19ea9ba7 100644 --- a/src/megatron/bridge/training/utils/visual_inputs.py +++ b/src/megatron/bridge/training/utils/visual_inputs.py @@ -61,9 +61,15 @@ class Qwen2_5_VLVisualInputs: # Image tensors, e.g., Qwen2.5-VL processor output. pixel_values: Optional[torch.Tensor] = None - # Per-image temporal/spatial grid metadata (T, H, W) for videos, Qwen2.5-VL. + # Video tensors, e.g., Qwen2.5-VL processor output. + pixel_values_videos: Optional[torch.Tensor] = None + + # Per-image (T, H, W) grid metadata. image_grid_thw: Optional[torch.Tensor] = None + # Per-video (T, H, W) grid metadata. + video_grid_thw: Optional[torch.Tensor] = None + def as_model_kwargs(self) -> dict[str, torch.Tensor]: """Return a mapping of non-None fields suitable for model forward kwargs.""" result: dict[str, torch.Tensor] = {} @@ -77,7 +83,9 @@ def normalized_for_model(self) -> dict[str, torch.Tensor]: """Return non-None fields with shapes normalized for model expectations. 
- pixel_values: [B, N, C, H, W] -> [B*N, C, H, W] + - pixel_values_videos: [B, N, C, H, W] -> [B*N, C, H, W] - image_grid_thw: [B, N, 3] -> [B*N, 3] + - video_grid_thw: [B, N, 3] -> [B*N, 3] """ kwargs = self.as_model_kwargs() @@ -86,10 +94,19 @@ def normalized_for_model(self) -> dict[str, torch.Tensor]: b, n, c, h, w = pixel_values.shape kwargs["pixel_values"] = pixel_values.view(b * n, c, h, w) + pixel_values_videos = kwargs.get("pixel_values_videos") + if isinstance(pixel_values_videos, torch.Tensor) and pixel_values_videos.dim() == 5: + b, n, c, h, w = pixel_values_videos.shape + kwargs["pixel_values_videos"] = pixel_values_videos.view(b * n, c, h, w) + image_grid_thw = kwargs.get("image_grid_thw") if isinstance(image_grid_thw, torch.Tensor) and image_grid_thw.dim() == 3: kwargs["image_grid_thw"] = image_grid_thw.view(-1, image_grid_thw.size(-1)) + video_grid_thw = kwargs.get("video_grid_thw") + if isinstance(video_grid_thw, torch.Tensor) and video_grid_thw.dim() == 3: + kwargs["video_grid_thw"] = video_grid_thw.view(-1, video_grid_thw.size(-1)) + return kwargs diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py index 8b271447a4..4df792df2c 100644 --- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py @@ -568,8 +568,12 @@ def test_qwen3_vl_8b_peft_energon_task_encoder(monkeypatch: pytest.MonkeyPatch): cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import QwenVLTaskEncoder + from megatron.bridge.recipes.qwen_vl.qwen3_vl import QwenVLEnergonProvider + assert isinstance(cfg.dataset, QwenVLEnergonProvider) assert isinstance(cfg.dataset.task_encoder, QwenVLTaskEncoder) + assert cfg.dataset.min_pixels == 200704 + assert cfg.dataset.max_pixels == 1003520 # ============================================================================= From 33ddcd98cd6961c9c5ea676b8036d841eb9e0c4a Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Wed, 6 May 2026 07:59:29 -0700 Subject: [PATCH 02/12] adding inference code for Qwen3 for multi-images and video --- .../conversion/hf_to_megatron_generate_vlm.py | 49 ++++++++++++---- examples/conversion/vlm_generate_utils.py | 56 +++++++++++++++++++ 2 files changed, 95 insertions(+), 10 deletions(-) diff --git a/examples/conversion/hf_to_megatron_generate_vlm.py b/examples/conversion/hf_to_megatron_generate_vlm.py index 6392aa34c6..bb6565069a 100644 --- a/examples/conversion/hf_to_megatron_generate_vlm.py +++ b/examples/conversion/hf_to_megatron_generate_vlm.py @@ -38,6 +38,8 @@ pad_input_ids_to_tp_multiple, patch_kimi_vision_processor, process_image_inputs, + process_multi_image_inputs, + process_video_inputs, to_cuda, ) @@ -64,6 +66,8 @@ def __init__( image_grid_thw=None, image_sizes=None, mm_token_type_ids=None, + pixel_values_videos=None, + video_grid_thw=None, ): self.batch = dict( tokens=input_ids, @@ -78,6 +82,10 @@ def __init__( self.batch["image_sizes"] = image_sizes if mm_token_type_ids is not None: self.batch["mm_token_type_ids"] = mm_token_type_ids + if pixel_values_videos is not None: + self.batch["pixel_values_videos"] = pixel_values_videos + if video_grid_thw is not None: + self.batch["video_grid_thw"] = video_grid_thw self._yielded = False def __iter__(self): @@ -98,7 +106,7 @@ def vlm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor: "position_ids": batch["position_ids"], "attention_mask": batch.get("attention_mask"), } - 
for key in ("pixel_values", "image_grid_thw", "image_sizes", "mm_token_type_ids"): + for key in ("pixel_values", "image_grid_thw", "image_sizes", "mm_token_type_ids", "pixel_values_videos", "video_grid_thw"): if key in batch: forward_args[key] = batch[key] @@ -211,19 +219,33 @@ def _disable_mtp(m): # ------------------------------------------------------------------ # Process inputs # ------------------------------------------------------------------ - input_ids_raw, pixel_values, image_grid_thw, image_sizes, mm_token_type_ids = process_image_inputs( - processor, - args.image_path, - args.prompt, - is_kimi=is_kimi, - image_token_id=image_token_id, - ) + pixel_values = image_grid_thw = image_sizes = mm_token_type_ids = None + pixel_values_videos = video_grid_thw = None + + if args.video_path: + input_ids_raw, pixel_values_videos, video_grid_thw = process_video_inputs( + processor, args.video_path, args.prompt, fps=args.video_fps + ) + elif args.image_paths: + input_ids_raw, pixel_values, image_grid_thw = process_multi_image_inputs( + processor, args.image_paths, args.prompt + ) + else: + input_ids_raw, pixel_values, image_grid_thw, image_sizes, mm_token_type_ids = process_image_inputs( + processor, + args.image_path, + args.prompt, + is_kimi=is_kimi, + image_token_id=image_token_id, + ) input_ids_raw = input_ids_raw.cuda() pixel_values = to_cuda(pixel_values) image_grid_thw = to_cuda(image_grid_thw) image_sizes = to_cuda(image_sizes) mm_token_type_ids = to_cuda(mm_token_type_ids) + pixel_values_videos = to_cuda(pixel_values_videos) + video_grid_thw = to_cuda(video_grid_thw) # ------------------------------------------------------------------ # Greedy generation loop @@ -250,7 +272,8 @@ def _disable_mtp(m): fwd_bwd_function = get_forward_backward_func() iterator = SingleBatchIterator( - input_ids, position_ids, None, pixel_values, image_grid_thw, image_sizes, mm_ids_padded + input_ids, position_ids, None, pixel_values, image_grid_thw, image_sizes, mm_ids_padded, + pixel_values_videos, video_grid_thw, ) output = fwd_bwd_function( @@ -323,7 +346,13 @@ def _disable_mtp(m): "--pp_layout", type=str, default=None, help="Pipeline model parallel layout (e.g. 'Et*15|t*15|t*16|t*15L')" ) parser.add_argument("--megatron_model_path", type=str, default=None, help="Path to Megatron model checkpoint") - parser.add_argument("--image_path", type=str, default=None, help="Path or URL to image (optional).") + parser.add_argument("--image_path", type=str, default=None, help="Path or URL to a single image (optional).") + parser.add_argument("--image_paths", type=str, nargs="+", default=None, + help="Paths to N image files in order (multi-image; Qwen-family only).") + parser.add_argument("--video_path", type=str, default=None, + help="Path to a video file (Qwen-family only).") + parser.add_argument("--video_fps", type=float, default=2.0, + help="Frames per second to sample from the video (default: 2.0).") parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code for HF model loading") args = parser.parse_args() diff --git a/examples/conversion/vlm_generate_utils.py b/examples/conversion/vlm_generate_utils.py index f0f0d7b280..8650daf0a9 100644 --- a/examples/conversion/vlm_generate_utils.py +++ b/examples/conversion/vlm_generate_utils.py @@ -122,6 +122,62 @@ def to_cuda(x): return x.cuda() +def process_multi_image_inputs(processor, image_paths: list[str], prompt: str): + """Process N ordered images + prompt into model inputs (Qwen-family). 
+ + Returns: + (input_ids, pixel_values, image_grid_thw) + """ + if not _HAS_QWEN_VL_UTILS: + raise ImportError("qwen-vl-utils required: pip install qwen-vl-utils") + pils = [load_image(p).convert("RGB") for p in image_paths] + messages = [ + { + "role": "user", + "content": [{"type": "image", "image": p} for p in pils] + + [{"type": "text", "text": prompt}], + } + ] + image_inputs, video_inputs = process_vision_info(messages) + text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt") + return inputs.input_ids, inputs.get("pixel_values"), inputs.get("image_grid_thw") + + +def process_video_inputs(processor, video_path: str, prompt: str, *, fps: float = 2.0): + """Process a video + prompt into model inputs (Qwen-family). + + Frame decoding mirrors the Qwen3-VL training pipeline: fetch_video decodes at + ``fps``, then video_processor is called with do_sample_frames=False to use the + pre-decoded frames as-is. + + Returns: + (input_ids, pixel_values_videos, video_grid_thw) + """ + if not _HAS_QWEN_VL_UTILS: + raise ImportError("qwen-vl-utils required: pip install qwen-vl-utils") + from qwen_vl_utils import fetch_video + + frames = fetch_video({"video": video_path, "fps": fps}) + messages = [ + { + "role": "user", + "content": [{"type": "video"}, {"type": "text", "text": prompt}], + } + ] + text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + text_inputs = processor(text=[text], padding=True, return_tensors="pt") + video_proc = processor.video_processor(videos=[frames], return_tensors="pt", do_sample_frames=False) + # processor(text=...) without videos produces a single <|video_pad|> placeholder (id 151656). + # Pre-expand to match actual vision feature count so PP send/recv shapes are correct. 
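+    # For reference: the expansion count per video works out to
+    # prod(T, H, W) // merge_size**2, matching the visual-token arithmetic
+    # used by the training task encoder.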
+ input_ids = pre_expand_image_tokens( + text_inputs["input_ids"], + video_proc["video_grid_thw"], + image_token_id=151656, # <|video_pad|> for Qwen-VL family + ) + return input_ids, video_proc.get("pixel_values_videos"), video_proc.get("video_grid_thw") + + def process_image_inputs( processor, image_path: Optional[str], From 9d1fed0caa83ff78769375e7f9fccbc996d9bc16 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Wed, 6 May 2026 08:55:56 -0700 Subject: [PATCH 03/12] style: fix ruff-format line-length violations flagged by CI Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Huy Vu2 --- examples/conversion/hf_to_megatron_generate_vlm.py | 10 +++++++++- .../recipes/qwen_vl/data/energon/task_encoder.py | 8 ++++++-- src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py | 4 +++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/conversion/hf_to_megatron_generate_vlm.py b/examples/conversion/hf_to_megatron_generate_vlm.py index 7c53b9f084..87873c1b50 100644 --- a/examples/conversion/hf_to_megatron_generate_vlm.py +++ b/examples/conversion/hf_to_megatron_generate_vlm.py @@ -109,7 +109,15 @@ def vlm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor: "position_ids": batch["position_ids"], "attention_mask": batch.get("attention_mask"), } - for key in ("pixel_values", "image_grid_thw", "image_sizes", "mm_token_type_ids", "pixel_values_videos", "video_grid_thw", "image_position_ids"): + for key in ( + "pixel_values", + "image_grid_thw", + "image_sizes", + "mm_token_type_ids", + "pixel_values_videos", + "video_grid_thw", + "image_position_ids", + ): if key in batch: forward_args[key] = batch[key] diff --git a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py index ef33dc758b..2cbf0177ce 100644 --- a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py +++ b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py @@ -266,7 +266,9 @@ def encode_sample(self, sample: ChatMLSample): total_visual_tokens, self.max_visual_tokens, ) - print(f"[DEBUG] (task_encoder.py) Skipping sample {sample.__key__} because it has {total_visual_tokens} visual tokens, which exceeds max_visual_tokens={self.max_visual_tokens}") + print( + f"[DEBUG] (task_encoder.py) Skipping sample {sample.__key__} because it has {total_visual_tokens} visual tokens, which exceeds max_visual_tokens={self.max_visual_tokens}" + ) raise SkipSample() # Normalize conversation to [{"role": ..., "content": ...}, ...] @@ -346,7 +348,9 @@ def encode_sample(self, sample: ChatMLSample): logging.warning( f"Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample." ) - print(f"[DEBUG] (task_encoder.py) Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample.") + print( + f"[DEBUG] (task_encoder.py) Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample." 
+ ) # raise SkipSample() final_input_ids = np.zeros(target_length, dtype=input_ids.dtype) final_input_masks = final_input_ids.copy() diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py index d1e2a13da7..45c0a78dda 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py @@ -1168,5 +1168,7 @@ def qwen3_vl_8b_peft_energon_config(peft_scheme: str | PEFT = "lora") -> ConfigC """ cfg = qwen3_vl_8b_peft_config(peft_scheme=peft_scheme) hf_path = "Qwen/Qwen3-VL-8B-Instruct" - cfg.dataset = _make_energon_dataset(hf_path, cfg.model.seq_length, cfg.train.micro_batch_size, cfg.train.global_batch_size) + cfg.dataset = _make_energon_dataset( + hf_path, cfg.model.seq_length, cfg.train.micro_batch_size, cfg.train.global_batch_size + ) return cfg From 67e6f2403b95ff51ecd57ae3fb3b10456880e418 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Wed, 6 May 2026 10:04:33 -0700 Subject: [PATCH 04/12] style: apply ruff-format reformats and remove debug prints Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Huy Vu2 --- .../conversion/hf_to_megatron_generate_vlm.py | 17 +++++++++++------ examples/conversion/vlm_generate_utils.py | 3 +-- .../qwen_vl/data/energon/task_encoder.py | 11 +---------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/conversion/hf_to_megatron_generate_vlm.py b/examples/conversion/hf_to_megatron_generate_vlm.py index 87873c1b50..9ea04880e4 100644 --- a/examples/conversion/hf_to_megatron_generate_vlm.py +++ b/examples/conversion/hf_to_megatron_generate_vlm.py @@ -376,12 +376,17 @@ def _disable_mtp(m): ) parser.add_argument("--megatron_model_path", type=str, default=None, help="Path to Megatron model checkpoint") parser.add_argument("--image_path", type=str, default=None, help="Path or URL to a single image (optional).") - parser.add_argument("--image_paths", type=str, nargs="+", default=None, - help="Paths to N image files in order (multi-image; Qwen-family only).") - parser.add_argument("--video_path", type=str, default=None, - help="Path to a video file (Qwen-family only).") - parser.add_argument("--video_fps", type=float, default=2.0, - help="Frames per second to sample from the video (default: 2.0).") + parser.add_argument( + "--image_paths", + type=str, + nargs="+", + default=None, + help="Paths to N image files in order (multi-image; Qwen-family only).", + ) + parser.add_argument("--video_path", type=str, default=None, help="Path to a video file (Qwen-family only).") + parser.add_argument( + "--video_fps", type=float, default=2.0, help="Frames per second to sample from the video (default: 2.0)." 
+ ) parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code for HF model loading") args = parser.parse_args() diff --git a/examples/conversion/vlm_generate_utils.py b/examples/conversion/vlm_generate_utils.py index 5a5e383311..db47fcf670 100644 --- a/examples/conversion/vlm_generate_utils.py +++ b/examples/conversion/vlm_generate_utils.py @@ -144,8 +144,7 @@ def process_multi_image_inputs(processor, image_paths: list[str], prompt: str): messages = [ { "role": "user", - "content": [{"type": "image", "image": p} for p in pils] - + [{"type": "text", "text": prompt}], + "content": [{"type": "image", "image": p} for p in pils] + [{"type": "text", "text": prompt}], } ] image_inputs, video_inputs = process_vision_info(messages) diff --git a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py index 2cbf0177ce..12c1e4ed4b 100644 --- a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py +++ b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py @@ -57,8 +57,6 @@ def process_vision( image_grid_thw = None if videos is not None: - # DEBUGGING - # videos_inputs = processor(images=None, text="", videos=videos, return_tensors="pt") # Pre-decoded frames from WDS are already at the desired sampling rate. # do_sample_frames=False prevents the processor from re-sampling them under # a spurious 24 fps assumption, which would reduce most clips to T=2. @@ -223,7 +221,6 @@ def encode_sample(self, sample: ChatMLSample): len(imgs_for_processing), self.max_num_images, ) - print(f"[DEBUG] (task_encoder.py) Skipping sample {sample.__key__} because it has {len(imgs_for_processing)} images, which exceeds max_num_images={self.max_num_images}") raise SkipSample() if self.max_num_frames is not None and videos_for_processing is not None: @@ -266,9 +263,6 @@ def encode_sample(self, sample: ChatMLSample): total_visual_tokens, self.max_visual_tokens, ) - print( - f"[DEBUG] (task_encoder.py) Skipping sample {sample.__key__} because it has {total_visual_tokens} visual tokens, which exceeds max_visual_tokens={self.max_visual_tokens}" - ) raise SkipSample() # Normalize conversation to [{"role": ..., "content": ...}, ...] @@ -348,10 +342,7 @@ def encode_sample(self, sample: ChatMLSample): logging.warning( f"Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample." ) - print( - f"[DEBUG] (task_encoder.py) Long sequence with length {target_length} and visual tokens {total_visual_tokens} exceeds seq_len={self.seq_len}, truncation will affect visual tokens, dropping sample." 
- ) - # raise SkipSample() + raise SkipSample() final_input_ids = np.zeros(target_length, dtype=input_ids.dtype) final_input_masks = final_input_ids.copy() From c1555c0961c46f3eb47d053cd9f97e8cce9d3cf8 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Wed, 6 May 2026 10:13:38 -0700 Subject: [PATCH 05/12] style: fix remaining ruff-format violations in task_encoder.py Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Huy Vu2 --- .../recipes/qwen_vl/data/energon/task_encoder.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py index 12c1e4ed4b..67c86c5f77 100644 --- a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py +++ b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py @@ -39,7 +39,6 @@ from megatron.bridge.training.utils.visual_inputs import Qwen2_5_VLVisualInputs - def process_vision( processor, images, videos, fps=None, model_version: str = "qwen-vl", min_pixels=None, max_pixels=None ): @@ -233,7 +232,9 @@ def encode_sample(self, sample: ChatMLSample): self.max_num_frames, sample.__key__, ) - print(f"[DEBUG] (task_encoder.py) Truncating {len(v)} frames to max_num_frames={self.max_num_frames} for sample {sample.__key__}") + print( + f"[DEBUG] (task_encoder.py) Truncating {len(v)} frames to max_num_frames={self.max_num_frames} for sample {sample.__key__}" + ) clipped.append(v[: self.max_num_frames]) else: clipped.append(v) @@ -252,8 +253,12 @@ def encode_sample(self, sample: ChatMLSample): flattened_videos = processed_vision["video_inputs"] merge_length = self.merge_size**2 - image_tokens = int(image_thw_grids.prod(dim=-1).sum().item()) // merge_length if image_thw_grids is not None else 0 - video_tokens = int(video_thw_grids.prod(dim=-1).sum().item()) // merge_length if video_thw_grids is not None else 0 + image_tokens = ( + int(image_thw_grids.prod(dim=-1).sum().item()) // merge_length if image_thw_grids is not None else 0 + ) + video_tokens = ( + int(video_thw_grids.prod(dim=-1).sum().item()) // merge_length if video_thw_grids is not None else 0 + ) total_visual_tokens = image_tokens + video_tokens if self.max_visual_tokens is not None: if total_visual_tokens > self.max_visual_tokens: From 859eb867bbfe5f2d8d98413be1334723d3868e18 Mon Sep 17 00:00:00 2001 From: Huy Vu Date: Thu, 7 May 2026 08:22:59 -0700 Subject: [PATCH 06/12] [recipe] test: Update QwenVL task encoder test mocks for torch tensors Adapts the unit tests to the refactored encoder which now computes visual-token counts via .prod(dim=-1) (torch syntax) on the processor's image_grid_thw / video_grid_thw outputs. The mocks previously returned np.array, causing TypeError. Also bumps max_padding_length to 512 so the expanded sequence length stays within seq_len and avoids the new SkipSample() path. 
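For illustration, the mismatch looks roughly like this:

    np.array([[1, 28, 28]]).prod(dim=-1)      # TypeError: numpy expects axis=, not dim=
    torch.tensor([[1, 28, 28]]).prod(dim=-1)  # tensor([784])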
Signed-off-by: Huy Vu
---
 .../recipes/qwen_vl/data/energon/test_task_encoder.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py b/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py
index bb4e83abbd..d6e019f643 100644
--- a/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py
+++ b/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py
@@ -164,7 +164,7 @@ def setUp(self):
         self.encoder = QwenVLTaskEncoder(
             tokenizer=self.tokenizer,
             image_processor=self.image_processor,
-            max_padding_length=128,
+            max_padding_length=512,
             patch_size=14,
             spatial_merge_size=2,
         )
@@ -184,10 +184,10 @@ def test_encode_sample(self):
         def processor_side_effect(images=None, videos=None, **kwargs):
             res = {}
             if images:
-                res["image_grid_thw"] = np.array([[1, 28, 28]])  # 1 tile, 28x28
+                res["image_grid_thw"] = torch.tensor([[1, 28, 28]])  # 1 tile, 28x28
                 res["pixel_values"] = torch.randn(1, 3, 28, 28)
             if videos:
-                res["video_grid_thw"] = np.array([[1, 28, 28]])
+                res["video_grid_thw"] = torch.tensor([[1, 28, 28]])
                 res["pixel_values_videos"] = torch.randn(1, 3, 28, 28)
             return res
 
@@ -243,10 +243,10 @@ def test_encode_sample_from_value_format(self):
         def processor_side_effect(images=None, videos=None, **kwargs):
             res = {}
             if images:
-                res["image_grid_thw"] = np.array([[1, 28, 28]])
+                res["image_grid_thw"] = torch.tensor([[1, 28, 28]])
                 res["pixel_values"] = torch.randn(1, 3, 28, 28)
             if videos:
-                res["video_grid_thw"] = np.array([[1, 28, 28]])
+                res["video_grid_thw"] = torch.tensor([[1, 28, 28]])
                 res["pixel_values_videos"] = torch.randn(1, 3, 28, 28)
             return res
 

From bca268d8e5096fc6486487cf64e649e844da8c4c Mon Sep 17 00:00:00 2001
From: Huy Vu2
Date: Thu, 7 May 2026 08:38:38 -0700
Subject: [PATCH 07/12] [docs, recipe] docs: Document Qwen3-VL visual token budget controls

Adds README section describing the three composable controls that bound
GPU cost per sample (min/max_pixels, max_num_images/max_num_frames,
max_visual_tokens) and asserts the PEFT energon recipe defaults so the
documented contract is enforced by tests.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Huy Vu2
---
 examples/models/vlm/qwen3_vl/README.md                    | 14 ++++++++++++++
 tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py |  4 +++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/examples/models/vlm/qwen3_vl/README.md b/examples/models/vlm/qwen3_vl/README.md
index 38817cff96..21f6c22c3e 100644
--- a/examples/models/vlm/qwen3_vl/README.md
+++ b/examples/models/vlm/qwen3_vl/README.md
@@ -127,6 +127,20 @@ field_map:
 
 Then, update the dataset path (`dataset.path=/path/to/energon/dataset`) in [peft_energon.sh](peft_energon.sh) and run the script.
 
+#### Controlling the visual-token compute budget
+Three independent, CLI-overridable controls bound each sample's GPU cost, and they compose:
+- **`dataset.min_pixels` / `dataset.max_pixels`** — lower and upper bounds on the resolution of each image or frame, in pixels (defaults `200704` / `1003520`).
+- **`dataset.max_num_images` / `dataset.max_num_frames`** — limits on the number of images and video frames per sample (defaults `10` / `60`). Too many images → the sample is dropped. Too many frames → the frame list is truncated.
+- **`dataset.max_visual_tokens`** — cap on the total visual tokens across all images and frames in a sample, computed post-rescaling as `prod(T, H, W) // merge_size²` (default `16384`; set to `None` to disable). This catches cases the other two controls miss (a few images at very high resolution, or many images at low resolution). Samples that exceed the cap are dropped.
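+For a rough sense of scale, here is a sketch of the budget arithmetic, assuming the Qwen-VL defaults of 14×14 patches and a 2×2 spatial merge (the values used in this recipe's tests):
+
+```python
+# Illustrative only: visual-token cost of one image at dataset.max_pixels.
+patches = 1003520 // (14 * 14)         # 5120 patches per max-resolution image
+tokens_per_image = patches // (2 * 2)  # 1280 tokens after the 2x2 spatial merge
+budget_images = 16384 // tokens_per_image  # ~12 such images fit the default cap
+```
+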
### Expected Training Dynamics We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/lczz4ixx) for the expected loss curves and grad norms. diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py index 4df792df2c..a101800a1c 100644 --- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py @@ -574,7 +574,9 @@ def test_qwen3_vl_8b_peft_energon_task_encoder(monkeypatch: pytest.MonkeyPatch): assert isinstance(cfg.dataset.task_encoder, QwenVLTaskEncoder) assert cfg.dataset.min_pixels == 200704 assert cfg.dataset.max_pixels == 1003520 - + assert cfg.dataset.max_num_images == 10 + assert cfg.dataset.max_num_frames == 60 + assert cfg.dataset.max_visual_tokens == 16384 # ============================================================================= # Qwen3-VL Pretrain Mock Config Tests From 3b48e4e0d9afd5c426f8a39a166a45c70f797da8 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Thu, 7 May 2026 08:45:30 -0700 Subject: [PATCH 08/12] [recipe] fix: Add missing blank line before module-level comment block Pre-commit / ruff format requires two blank lines between a function and the following module-level block. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Huy Vu2 --- tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py index a101800a1c..d1974c8f4d 100644 --- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py @@ -578,6 +578,7 @@ def test_qwen3_vl_8b_peft_energon_task_encoder(monkeypatch: pytest.MonkeyPatch): assert cfg.dataset.max_num_frames == 60 assert cfg.dataset.max_visual_tokens == 16384 + # ============================================================================= # Qwen3-VL Pretrain Mock Config Tests # ============================================================================= From 84274773c7f461f0690f69eebd021c2adb7b38a9 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Thu, 7 May 2026 14:19:02 -0700 Subject: [PATCH 09/12] [recipe] test: Add unit tests for QwenVL task encoder limits, provider config sync, and visual inputs video reshape Covers three pieces of recently added behavior: - Per-sample budget limits in QwenVLTaskEncoder (max_num_images skip, max_num_frames truncation, default values). - QwenVLEnergonProvider.build_datasets propagating CLI-overridable knobs onto the task encoder before delegating to the parent. - Qwen2_5_VLVisualInputs.normalized_for_model handling video tensors and mixed image/video shapes, including already-flat passthrough. 
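A sketch of the last case: a [B, N, C, H, W] video tensor flattens to
[B*N, C, H, W], while an already-flat tensor passes through unchanged:

    inputs = Qwen2_5_VLVisualInputs(pixel_values_videos=torch.randn(2, 3, 3, 28, 28))
    assert inputs.normalized_for_model()["pixel_values_videos"].shape == (6, 3, 28, 28)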
Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: Huy Vu2
---
 .../qwen_vl/data/energon/test_task_encoder.py | 208 ++++++++++++++++++
 .../qwen_vl/test_qwen_vl_energon_provider.py  | 156 +++++++++++++
 .../training/utils/test_visual_inputs.py      |  72 ++++++
 3 files changed, 436 insertions(+)
 create mode 100644 tests/unit_tests/recipes/qwen_vl/test_qwen_vl_energon_provider.py

diff --git a/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py b/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py
index d6e019f643..f2e4d2371f 100644
--- a/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py
+++ b/tests/unit_tests/recipes/qwen_vl/data/energon/test_task_encoder.py
@@ -21,6 +21,7 @@
 import numpy as np
 import pytest
 import torch
+from megatron.energon import SkipSample
 from PIL import Image
 
 from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import (
@@ -361,5 +362,212 @@ def test_encode_batch(self):
         self.assertNotIn("__subflavors__", encoded_dict)
 
 
+class TestQwenVLTaskEncoderLimits(unittest.TestCase):
+    """Tests for the per-sample budget limits added to QwenVLTaskEncoder."""
+
+    def setUp(self):
+        self.tokenizer = MagicMock()
+        self.tokenizer.pad_token_id = 0
+        self.tokenizer.eos_token_id = 1
+        self.tokenizer.image_token_id = 151655
+        self.tokenizer.video_token_id = 151656
+        self.tokenizer.convert_tokens_to_ids.side_effect = lambda x: {
+            "<|image_pad|>": 151655,
+            "