@@ -324,12 +324,12 @@ def loss_fn(outputs, labels):
     ds_config_dict = get_ds_config(
         batch_size=bs,
         micro_batch_size_per_gpu=micro_bs,
-        fp16=False,
+        fp16=True,
     )
     device = "cuda"
     input_ids = torch.ones(micro_bs, seq_len, dtype=torch.long, device=device)
     attention_mask = torch.ones(
-        micro_bs, seq_len, dtype=torch.float32, requires_grad=False, device=device
+        micro_bs, seq_len, dtype=torch.float16, requires_grad=False, device=device
     )
     position_ids = torch.ones(
         micro_bs, seq_len, dtype=torch.long, requires_grad=False, device=device
@@ -346,6 +346,7 @@ def loss_fn(outputs, labels):
         topology=topology,
         config=ds_config_dict,
         init_weights=model._init_weights,
+        dtype=torch.float16,
     ):
         sch.trace_until(
             "transformer", tracer="huggingface", concrete_args=concrete_args
@@ -414,12 +415,12 @@ def loss_fn(outputs, labels):
     ds_config_dict = get_ds_config(
         batch_size=bs,
         micro_batch_size_per_gpu=micro_bs,
-        fp16=False,
+        fp16=True,
     )
     device = "cuda"
     input_ids = torch.ones(micro_bs, seq_len, dtype=torch.long, device=device)
     attention_mask = torch.ones(
-        micro_bs, seq_len, dtype=torch.float32, requires_grad=False, device=device
+        micro_bs, seq_len, dtype=torch.float16, requires_grad=False, device=device
     )
     position_ids = torch.ones(
         micro_bs, seq_len, dtype=torch.long, requires_grad=False, device=device
@@ -436,6 +437,7 @@ def loss_fn(outputs, labels):
         topology=topology,
         config=ds_config_dict,
         init_weights=model._init_weights,
+        dtype=torch.float16,
     ):
         sch.trace_until(
             "transformer", tracer="huggingface", concrete_args=concrete_args