Better assertion, and set no_sync only when PP is 1
deepakn94 committed Sep 5, 2023
1 parent cb2b887 commit 9abd8cf
Showing 2 changed files with 5 additions and 2 deletions.
4 changes: 3 additions & 1 deletion megatron/model/distributed.py
@@ -93,7 +93,9 @@ def done(self):
         if not self.overlap_grad_reduce:
             self.all_reduce()
             return
-        assert self.allreduce_handle is not None, 'allreduce is not issued for this bucket'
+        assert self.allreduce_handle is not None, \
+            (f'allreduce is not issued for this bucket, '
+             f'{len(self.params_with_grad)}/{len(self.params)} grads available')
         self.allreduce_handle.wait()
         self.allreduce_handle = None

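For context: done() is part of the per-bucket state in Megatron's local distributed data parallel wrapper. With overlap_grad_reduce enabled, an asynchronous all-reduce should already have been issued once every parameter in the bucket has registered its gradient, and done() simply waits on that handle. The improved message reports how many of the bucket's gradients were actually available, which helps pinpoint parameters that never produced a grad. Below is a minimal, self-contained sketch of that pattern, not Megatron's implementation; the Bucket class shape, register_grad_ready, and the single-process gloo setup are illustrative assumptions.

# Minimal sketch of the bucket pattern the assertion guards (assumed names,
# not Megatron's code): grads are registered into a bucket, an async
# all-reduce is issued once the bucket is full, and done() waits on it.
import os
import torch
import torch.distributed as dist


class Bucket:
    def __init__(self, params, overlap_grad_reduce=True):
        self.params = set(params)
        self.params_with_grad = set()
        self.overlap_grad_reduce = overlap_grad_reduce
        self.allreduce_handle = None
        # One flat buffer standing in for the bucket's gradient storage.
        self.data = torch.zeros(sum(p.numel() for p in params))

    def all_reduce(self):
        # Issue the (possibly asynchronous) all-reduce over the bucket buffer.
        self.allreduce_handle = dist.all_reduce(
            self.data, async_op=self.overlap_grad_reduce)

    def register_grad_ready(self, param):
        # Would be called from a grad hook; launch the all-reduce once every
        # parameter in the bucket has a gradient.
        self.params_with_grad.add(param)
        if self.overlap_grad_reduce and len(self.params_with_grad) == len(self.params):
            self.all_reduce()

    def done(self):
        if not self.overlap_grad_reduce:
            self.all_reduce()
            return
        assert self.allreduce_handle is not None, (
            f'allreduce is not issued for this bucket, '
            f'{len(self.params_with_grad)}/{len(self.params)} grads available')
        self.allreduce_handle.wait()
        self.allreduce_handle = None


if __name__ == '__main__':
    # Single-process gloo group, just enough to exercise the code path.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)
    params = [torch.nn.Parameter(torch.randn(4)) for _ in range(3)]
    bucket = Bucket(params)
    for p in params:
        bucket.register_grad_ready(p)
    bucket.done()  # skipping a register_grad_ready call would trip the assertion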
3 changes: 2 additions & 1 deletion megatron/training.py
@@ -708,7 +708,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     config.grad_scale_func = optimizer.scale_loss
     config.timers = timers
     # TODO: Remove this once we move LocalDDP to Core.
-    if len(model) == 1 and isinstance(model[0], LocalDDP):
+    if len(model) == 1 and isinstance(model[0], LocalDDP) and \
+            args.pipeline_model_parallel_size == 1:
         config.no_sync_func = model[0].no_sync
 
     timers('interval-time', log_level=0).start(barrier=True)
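The training.py change wires model[0].no_sync into config.no_sync_func only when no pipeline parallelism is used (pipeline_model_parallel_size == 1); with pipeline parallelism the hook is left unset. no_sync is the usual DDP-style context manager that suppresses gradient all-reduce so that, during gradient accumulation, synchronization happens only once per optimizer step. A minimal sketch of how such a hook is typically consumed follows; run_microbatches and forward_backward are hypothetical names, not Megatron's API.

# Sketch (assumed names) of consuming a no_sync_func hook during gradient
# accumulation: all-reduce is suppressed for every microbatch except the last.
import contextlib


def run_microbatches(microbatches, forward_backward, no_sync_func=None):
    """Run gradient accumulation, syncing grads only on the final microbatch."""
    if no_sync_func is None:
        no_sync_func = contextlib.nullcontext
    with no_sync_func():
        for batch in microbatches[:-1]:
            forward_backward(batch)      # grads accumulate locally, no all-reduce
    forward_backward(microbatches[-1])   # gradient sync happens here


if __name__ == '__main__':
    # Toy usage with the default null context; with a DDP-wrapped model one
    # would pass no_sync_func=model.no_sync instead.
    processed = []
    run_microbatches([0, 1, 2, 3], processed.append)
    print(processed)  # [0, 1, 2, 3]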
