diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch
index 45be723..c33be44 100644
--- a/gpt3/Megatron-LM.patch
+++ b/gpt3/Megatron-LM.patch
@@ -1,5 +1,5 @@
 diff --git a/megatron/arguments.py b/megatron/arguments.py
-index ae42b83..f427bc5 100644
+index ae42b83e..df50f67f 100644
 --- a/megatron/arguments.py
 +++ b/megatron/arguments.py
 @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
@@ -10,7 +10,7 @@ index ae42b83..f427bc5 100644
 
      # Custom arguments.
      if extra_args_provider is not None:
-@@ -1306,3 +1307,10 @@ def _add_vision_args(parser):
+@@ -1306,3 +1307,19 @@ def _add_vision_args(parser):
                          help='warmup teacher temperaure epochs')
 
      return parser
@@ -20,10 +20,18 @@ index ae42b83..f427bc5 100644
 +    group = parser.add_argument_group(title="msamp")
 +    group.add_argument('--msamp', action='store_true', default=False,
 +                       help='whether to enable msamp')
++    # Auto scaling factor tuning for FP8 collective communication
++    group.add_argument('--wgrad-auto-scaling', action='store_true', default=False,
++                       help='whether to enable auto scaling factor tuning on weight gradients reduction')
++    group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10,
++                       help='the frequency of checking whether overflow exists in the result of weight gradients reduction')
++    group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-3,
++                       help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction')
++    group.add_argument('--wgrad-auto-scaling-window', type=int, default=100,
++                       help='the window size for auto scaling factor tuning on weight gradients reduction')
 +    return parser
-\ No newline at end of file
 diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
-index e88b585..320f2b2 100644
+index e88b5851..320f2b24 100644
 --- a/megatron/checkpointing.py
 +++ b/megatron/checkpointing.py
 @@ -106,6 +106,59 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False,
@@ -406,7 +414,7 @@ index e88b585..320f2b2 100644
                           only_context_model=False, custom_load_path=None):
      """
 diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
-index a86444c..600f49d 100644
+index a86444cc..600f49d8 100644
 --- a/megatron/core/tensor_parallel/layers.py
 +++ b/megatron/core/tensor_parallel/layers.py
 @@ -439,7 +439,9 @@ def linear_with_grad_accumulation_and_async_allreduce(
@@ -532,7 +540,7 @@ index a86444c..600f49d 100644
           """Forward of RowParallelLinear
 
 diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
-index 484e9b3..e85984d 100644
+index 484e9b32..e85984d7 100644
 --- a/megatron/optimizer/__init__.py
 +++ b/megatron/optimizer/__init__.py
 @@ -5,10 +5,12 @@ from apex.optimizers import FusedSGD as SGD
@@ -577,7 +585,7 @@ index 484e9b3..e85984d 100644
          optimizer = SGD(param_groups,
                          lr=args.lr,
 diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
-index da9cd70..414fd88 100644
+index da9cd70f..414fd887 100644
 --- a/megatron/optimizer/optimizer.py
 +++ b/megatron/optimizer/optimizer.py
 @@ -13,13 +13,15 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
@@ -599,7 +607,7 @@ index da9cd70..414fd88 100644
  def _zero_grad_group_helper(group, set_to_none):
      """Zero out the gradient for a group of parameters.
 diff --git a/megatron/training.py b/megatron/training.py
-index b821ae7..99a7fad 100644
+index b821ae7b..99a7fadb 100644
 --- a/megatron/training.py
 +++ b/megatron/training.py
 @@ -33,7 +33,7 @@ from megatron.initialize import initialize_megatron
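
Note on the new --wgrad-auto-scaling* options added to _add_msamp_args above: their help strings describe a tuning loop on the reduced weight gradients, i.e. every --wgrad-auto-scaling-freq iterations the overflow ratio of the FP8 weight-gradient reduction is checked against --wgrad-auto-scaling-ratio over a history of --wgrad-auto-scaling-window observations, and the scaling factor is adjusted accordingly. The sketch below is only an illustration of that logic inferred from the flag descriptions; the names (AutoScaler, overflow_ratio, maybe_update) are hypothetical and are not part of this patch or of MS-AMP's actual implementation.

    # Hypothetical sketch of the scaling-factor tuning implied by the
    # --wgrad-auto-scaling* flags; not the code shipped by the patch.
    from collections import deque

    import torch


    class AutoScaler:
        def __init__(self, freq=10, ratio=1e-3, window=100, init_scale=2.0 ** 14):
            self.freq = freq                      # --wgrad-auto-scaling-freq
            self.ratio = ratio                    # --wgrad-auto-scaling-ratio
            self.history = deque(maxlen=window)   # --wgrad-auto-scaling-window
            self.scale = init_scale               # factor applied before the FP8 reduction

        def overflow_ratio(self, reduced_wgrad: torch.Tensor) -> float:
            # Fraction of non-finite elements in the reduced weight gradient.
            return (~torch.isfinite(reduced_wgrad)).float().mean().item()

        def maybe_update(self, iteration: int, reduced_wgrad: torch.Tensor) -> None:
            if iteration % self.freq != 0:
                return
            self.history.append(self.overflow_ratio(reduced_wgrad))
            if sum(self.history) / len(self.history) > self.ratio:
                self.scale /= 2.0                 # overflowing too often: back off
            else:
                self.scale *= 2.0 ** (1.0 / len(self.history))  # grow back slowly

This mirrors the familiar dynamic-scaling pattern (back off on overflow, recover gradually); the patch itself presumably keeps the equivalent state inside the optimizer/gradient-reduction path rather than in a standalone helper.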