Skip to content

Commit

Permalink
Divide the loss by update_freq when using gradient accumulation. (facebookresearch#1833)
Browse files Browse the repository at this point in the history

* Follow fairseq and divide the loss by update_freq when using gradient accumulation.

* Change update_freq default.
  • Loading branch information
stephenroller authored Jul 11, 2019
1 parent c27c073 commit 2a0184a
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion parlai/core/torch_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ def add_cmdline_args(cls, argparser):
lr_group.add_argument(
'--update-freq',
type=int,
default=-1,
default=1,
hidden=True,
help='Accumulate gradients N times before performing an optimizer.step().',
)
Expand Down Expand Up @@ -1679,6 +1679,10 @@ def backward(self, loss):
loss.backward(), for integration with distributed training and FP16
training.
"""
if self.opt.get('update_freq', 1) > 1:
# gradient accumulation, but still need to average across the minibatches
loss = loss / self.opt['update_freq']

if self.fp16:
self.optimizer.backward(loss, update_master_grads=False)
else:
Expand Down

0 comments on commit 2a0184a

Please sign in to comment.