Better assertion, and set no_sync only when PP is 1
deepakn94 committed Sep 5, 2023
1 parent cb2b887 commit 9abd8cf
Showing 2 changed files with 5 additions and 2 deletions.
4 changes: 3 additions & 1 deletion megatron/model/distributed.py
@@ -93,7 +93,9 @@ def done(self):
         if not self.overlap_grad_reduce:
             self.all_reduce()
             return
-        assert self.allreduce_handle is not None, 'allreduce is not issued for this bucket'
+        assert self.allreduce_handle is not None, \
+            (f'allreduce is not issued for this bucket, '
+             f'{len(self.params_with_grad)}/{len(self.params)} grads available')
         self.allreduce_handle.wait()
         self.allreduce_handle = None

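For context: done() is part of the per-bucket state in Megatron's local distributed data parallel wrapper. With overlap_grad_reduce enabled, an asynchronous all-reduce should already have been issued once every parameter in the bucket has registered its gradient, and done() simply waits on that handle. The improved message reports how many of the bucket's gradients were actually available, which helps pinpoint parameters that never produced a grad. Below is a minimal, self-contained sketch of that pattern, not Megatron's implementation; the Bucket class shape, register_grad_ready, and the single-process gloo setup are illustrative assumptions.

# Minimal sketch of the bucket pattern the assertion guards (assumed names,
# not Megatron's code): grads are registered into a bucket, an async
# all-reduce is issued once the bucket is full, and done() waits on it.
import os
import torch
import torch.distributed as dist


class Bucket:
    def __init__(self, params, overlap_grad_reduce=True):
        self.params = set(params)
        self.params_with_grad = set()
        self.overlap_grad_reduce = overlap_grad_reduce
        self.allreduce_handle = None
        # One flat buffer standing in for the bucket's gradient storage.
        self.data = torch.zeros(sum(p.numel() for p in params))

    def all_reduce(self):
        # Issue the (possibly asynchronous) all-reduce over the bucket buffer.
        self.allreduce_handle = dist.all_reduce(
            self.data, async_op=self.overlap_grad_reduce)

    def register_grad_ready(self, param):
        # Would be called from a grad hook; launch the all-reduce once every
        # parameter in the bucket has a gradient.
        self.params_with_grad.add(param)
        if self.overlap_grad_reduce and len(self.params_with_grad) == len(self.params):
            self.all_reduce()

    def done(self):
        if not self.overlap_grad_reduce:
            self.all_reduce()
            return
        assert self.allreduce_handle is not None, (
            f'allreduce is not issued for this bucket, '
            f'{len(self.params_with_grad)}/{len(self.params)} grads available')
        self.allreduce_handle.wait()
        self.allreduce_handle = None


if __name__ == '__main__':
    # Single-process gloo group, just enough to exercise the code path.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)
    params = [torch.nn.Parameter(torch.randn(4)) for _ in range(3)]
    bucket = Bucket(params)
    for p in params:
        bucket.register_grad_ready(p)
    bucket.done()  # skipping a register_grad_ready call would trip the assertion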
3 changes: 2 additions & 1 deletion megatron/training.py
@@ -708,7 +708,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     config.grad_scale_func = optimizer.scale_loss
     config.timers = timers
     # TODO: Remove this once we move LocalDDP to Core.
-    if len(model) == 1 and isinstance(model[0], LocalDDP):
+    if len(model) == 1 and isinstance(model[0], LocalDDP) and \
+            args.pipeline_model_parallel_size == 1:
         config.no_sync_func = model[0].no_sync
 
     timers('interval-time', log_level=0).start(barrier=True)
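The training.py change wires model[0].no_sync into config.no_sync_func only when no pipeline parallelism is used (pipeline_model_parallel_size == 1); with pipeline parallelism the hook is left unset. no_sync is the usual DDP-style context manager that suppresses gradient all-reduce so that, during gradient accumulation, synchronization happens only once per optimizer step. A minimal sketch of how such a hook is typically consumed follows; run_microbatches and forward_backward are hypothetical names, not Megatron's API.

# Sketch (assumed names) of consuming a no_sync_func hook during gradient
# accumulation: all-reduce is suppressed for every microbatch except the last.
import contextlib


def run_microbatches(microbatches, forward_backward, no_sync_func=None):
    """Run gradient accumulation, syncing grads only on the final microbatch."""
    if no_sync_func is None:
        no_sync_func = contextlib.nullcontext
    with no_sync_func():
        for batch in microbatches[:-1]:
            forward_backward(batch)      # grads accumulate locally, no all-reduce
    forward_backward(microbatches[-1])   # gradient sync happens here


if __name__ == '__main__':
    # Toy usage with the default null context; with a DDP-wrapped model one
    # would pass no_sync_func=model.no_sync instead.
    processed = []
    run_microbatches([0, 1, 2, 3], processed.append)
    print(processed)  # [0, 1, 2, 3]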
