Skip to content

Commit

Permalink
Divide the loss by update_freq when using gradient accumulation. (facebookresearch#1833)
Browse files Browse the repository at this point in the history

* Follow fairseq and divide the loss by update_freq when using gradient accumulation.

* Change update_freq default.
  • Loading branch information
stephenroller authored Jul 11, 2019
1 parent c27c073 commit 2a0184a
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion parlai/core/torch_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ def add_cmdline_args(cls, argparser):
lr_group.add_argument(
'--update-freq',
type=int,
default=-1,
default=1,
hidden=True,
help='Accumulate gradients N times before performing an optimizer.step().',
)
Expand Down Expand Up @@ -1679,6 +1679,10 @@ def backward(self, loss):
loss.backward(), for integration with distributed training and FP16
training.
"""
if self.opt.get('update_freq', 1) > 1:
# gradient accumulation, but still need to average across the minibatches
loss = loss / self.opt['update_freq']

if self.fp16:
self.optimizer.backward(loss, update_master_grads=False)
else:
Expand Down

0 comments on commit 2a0184a

Please sign in to comment.