Skip to content

Commit

Permalink
fix minor bug where we have to scale the loss to account for gradient…
Browse files Browse the repository at this point in the history
… accumulation, which sums before backprop. note that this is not a major bug because AdamW is scale invariant. however, this did affect gradient clipping
  • Loading branch information
karpathy committed Apr 13, 2023
1 parent a82b33b commit 553f949
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion train.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
master_process = True
seed_offset = 0
gradient_accumulation_steps *= 8 # simulate 8 gpus
print("total number of tokens per iteration:", batch_size * block_size * gradient_accumulation_steps)

if master_process:
os.makedirs(out_dir, exist_ok=True)
Expand Down Expand Up @@ -287,6 +288,7 @@ def get_lr(it):
model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
with ctx:
logits, loss = model(X, Y)
loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
# immediately async prefetch next batch while model is doing the forward pass on the GPU
X, Y = get_batch('train')
# backward pass, with gradient scaling if training in fp16
Expand All @@ -306,7 +308,9 @@ def get_lr(it):
dt = t1 - t0
t0 = t1
if iter_num % log_interval == 0 and master_process:
lossf = loss.item() # loss as float. note: this is a CPU-GPU sync point
# get loss as float. note: this is a CPU-GPU sync point
# scale up to undo the division above, approximating the true total loss (exact would have been a sum)
lossf = loss.item() * gradient_accumulation_steps
if local_iter_num >= 5: # let the training loop settle a bit
mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
Expand Down

0 comments on commit 553f949

Please sign in to comment.