Skip to content

Commit 5ddced5

Browse files
authored
Merge pull request PaddlePaddle#97 from shengxiangwang/xpu/all_reduce
add extra all_reduce for xpu
2 parents ee4a3ce + c4160e0 commit 5ddced5

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

ppdet/engine/trainer.py

+6
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,12 @@ def train(self, validate=False):
422422
# model forward
423423
outputs = model(data)
424424
loss = outputs['loss']
425+
426+
# avoid all_reduce timeouts caused by computation progress differing between XPU cards
427+
if self._nranks > 1 and self.cfg.use_xpu:
428+
tensor_for_all_reduce = paddle.to_tensor(1.0)
429+
paddle.distributed.all_reduce(tensor_for_all_reduce)
430+
425431
# model backward
426432
loss.backward()
427433
self.optimizer.step()

0 commit comments

Comments
 (0)