From 1f1353fe48f09007f1c2571b9548954f6af2053f Mon Sep 17 00:00:00 2001
From: qiyuw
Date: Wed, 6 May 2026 16:10:25 -0700
Subject: [PATCH] fix eval issue

Signed-off-by: qiyuw
---
 src/megatron/bridge/training/train.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/megatron/bridge/training/train.py b/src/megatron/bridge/training/train.py
index f16f3a59e3..e02ab376b2 100644
--- a/src/megatron/bridge/training/train.py
+++ b/src/megatron/bridge/training/train.py
@@ -593,6 +593,14 @@ def train(
         if energy_monitor is not None:
             energy_monitor.pause()
         timers("interval-time").stop()
+        if config.optimizer.reuse_grad_buf_for_mxfp8_param_ag and config.ddp.overlap_param_gather:
+            # disable_forward_pre_hook(param_sync=True) below force-syncs params for eval.
+            # Copy the main params to param buffer before the forced AllGather.
+            for model_chunk in model:
+                model_chunk.zero_grad_buffer()
+            for optim_instance in optimizer.chained_optimizers:
+                if isinstance(optim_instance, DistributedOptimizer):
+                    optim_instance._copy_main_params_to_param_buffer()
         if should_toggle_forward_pre_hook:
             disable_forward_pre_hook(model)
             pre_hook_enabled = False