@@ -1162,6 +1162,11 @@ def _train_loop(
1162
1162
# batch duration measurements when using timer callbacks.
1163
1163
self .callback (lambda c : c .on_batch_end (self , progress_tracker , save_path , sync_step = should_step ))
1164
1164
1165
+ # If this is the last batch in the epoch, increment the epoch counter before running evaluation so that metrics are reported
1166
+ # with the correct epoch.
1167
+ if batcher .last_batch ():
1168
+ progress_tracker .epoch += 1
1169
+
1165
1170
if progress_tracker .steps % final_steps_per_checkpoint == 0 :
1166
1171
if not self .skip_all_evaluation :
1167
1172
# Publishes metrics to MLflow if there are any MLflow callbacks.
@@ -1188,7 +1193,7 @@ def _train_loop(
1188
1193
# Checkpoint the model.
1189
1194
# NOTE: Ideally we would do this before evaluation, but for some reason DeepSpeed will complain
1190
1195
# about inflight params if we do that, which is why we checkpoint after eval instead. In practice,
1191
- # this should not make a difference, xcept in the unlikely event an error occurs during eval and we
1196
+ # this should not make a difference, except in the unlikely event an error occurs during eval and we
1192
1197
# want to resume from the last checkpoint, in which case we will lose slightly more progress this way.
1193
1198
if not self .skip_save_progress :
1194
1199
checkpoint_manager .save (progress_tracker .steps )
@@ -1197,7 +1202,6 @@ def _train_loop(
1197
1202
1198
1203
# If this was the last batch, invoke the `on_epoch_end` callback (the epoch counter was already incremented above).
1199
1204
if batcher .last_batch ():
1200
- progress_tracker .epoch += 1
1201
1205
self .callback (lambda c : c .on_epoch_end (self , progress_tracker , save_path ))
1202
1206
1203
1207
return should_break
0 commit comments