NVIDIA · wanna-01 · May 7, 2026
@@ -898,7 +898,7 @@ def pretrain(
     ft_integration.setup()
     timestamp_after_in_job_setup = time.time()
 
-    # Initalize and get arguments, timers, and Tensorboard writer.
+    # Initialize and get arguments, timers, and Tensorboard writer.
     initialize_megatron(
         get_embedding_ranks=get_embedding_ranks,
         get_position_embedding_ranks=get_position_embedding_ranks,
@@ -1960,7 +1960,7 @@ def _save_state_dict(attr_name, label):
     if save_params_in_this_iteration:
         _save_state_dict(attr_name="data", label="params")
 
-    # when freezing sub-models we may have a mixture of successful and unsucessful ranks,
+    # when freezing sub-models we may have a mixture of successful and unsuccessful ranks,
     # so we must gather across mp ranks
     update_successful = logical_and_across_model_parallel_group(update_successful)
     # grad_norm and num_zeros_in_grad will be None on ranks without trainable params,