diff --git a/ci_scripts/api_white_list.txt b/ci_scripts/api_white_list.txt
index 0e3f706166d..ab542386f0c 100644
--- a/ci_scripts/api_white_list.txt
+++ b/ci_scripts/api_white_list.txt
@@ -7,6 +7,8 @@ paddle/optimizer/Dpsgd_cn.rst
 paddle/reader/ComposeNotAligned_cn.rst
 paddle/fluid/layers/scatter_cn.rst
 paddle/tensor/manipulation/scatter_cn.rst
+paddle/distributed/init_parallel_env_cn.rst
+paddle/distributed/spawn_cn.rst
 paddle/distributed/fleet/Fleet_cn.rst
 paddle/distributed/fleet/utils/fs/ExecuteError_cn.rst
 paddle/distributed/fleet/utils/fs/FSFileExistsError_cn.rst
@@ -16,5 +18,6 @@ paddle/distributed/fleet/utils/fs/FSTimeOut_cn.rst
 paddle/distributed/fleet/utils/fs/FS_cn.rst
 paddle/distributed/fleet/utils/fs/HDFSClient_cn.rst
 paddle/distributed/fleet/utils/fs/LocalFS_cn.rst
+paddle/fluid/dygraph/parallel/DataParallel_cn.rst
 paddle/fluid/dygraph/parallel/ParallelEnv_cn.rst
 upgrade_guide_cn.md
diff --git a/doc/paddle/api/paddle/distributed/fleet/Fleet_cn.rst b/doc/paddle/api/paddle/distributed/fleet/Fleet_cn.rst
index 228829950be..bf49ddafead 100644
--- a/doc/paddle/api/paddle/distributed/fleet/Fleet_cn.rst
+++ b/doc/paddle/api/paddle/distributed/fleet/Fleet_cn.rst
@@ -420,9 +420,7 @@ server节点的运行, 此命令会将ParameterServer的进程启动并常驻直
             print("loss:", loss.numpy())

-            loss = dp_layer.scale_loss(loss)
             loss.backward()
-            dp_layer.apply_collective_grads()

             adam.step()
             adam.clear_grad()
@@ -651,9 +649,7 @@ server节点的运行, 此命令会将ParameterServer的进程启动并常驻直
             print("loss:", loss.numpy())

-            loss = dp_layer.scale_loss(loss)
             loss.backward()
-            dp_layer.apply_collective_grads()

             adam.step()
             adam.clear_grad()
@@ -716,9 +712,7 @@ server节点的运行, 此命令会将ParameterServer的进程启动并常驻直
             print("loss:", loss.numpy())

-            loss = dp_layer.scale_loss(loss)
             loss.backward()
-            dp_layer.apply_collective_grads()

             adam.step()
             adam.clear_grad()
diff --git a/doc/paddle/api/paddle/distributed/init_parallel_env_cn.rst b/doc/paddle/api/paddle/distributed/init_parallel_env_cn.rst
index eafe9f10f05..69f446b6562 100644
--- a/doc/paddle/api/paddle/distributed/init_parallel_env_cn.rst
+++ b/doc/paddle/api/paddle/distributed/init_parallel_env_cn.rst
@@ -53,9 +53,7 @@ init_parallel_env
         labels = paddle.randn([10, 1], 'float32')
         loss = loss_fn(outputs, labels)

-        loss = dp_layer.scale_loss(loss)
         loss.backward()
-        dp_layer.apply_collective_grads()

         adam.step()
         adam.clear_grad()
diff --git a/doc/paddle/api/paddle/distributed/spawn_cn.rst b/doc/paddle/api/paddle/distributed/spawn_cn.rst
index 21f8f762f50..f9ca3589f54 100644
--- a/doc/paddle/api/paddle/distributed/spawn_cn.rst
+++ b/doc/paddle/api/paddle/distributed/spawn_cn.rst
@@ -64,9 +64,7 @@ spawn
         if print_result is True:
             print("loss:", loss.numpy())

-        loss = dp_layer.scale_loss(loss)
         loss.backward()
-        dp_layer.apply_collective_grads()

         adam.step()
         adam.clear_grad()
diff --git a/doc/paddle/api/paddle/fluid/dygraph/parallel/DataParallel_cn.rst b/doc/paddle/api/paddle/fluid/dygraph/parallel/DataParallel_cn.rst
index ff0d6821ec8..99bee4a9494 100644
--- a/doc/paddle/api/paddle/fluid/dygraph/parallel/DataParallel_cn.rst
+++ b/doc/paddle/api/paddle/fluid/dygraph/parallel/DataParallel_cn.rst
@@ -69,133 +69,7 @@ DataParallel
         labels = paddle.randn([10, 1], 'float32')
         loss = loss_fn(outputs, labels)

-        loss = dp_layer.scale_loss(loss)
         loss.backward()
-        dp_layer.apply_collective_grads()
-
-        adam.step()
-        adam.clear_grad()
-
-    if __name__ == '__main__':
-        # 1. start by ``paddle.distributed.spawn`` (default)
-        dist.spawn(train, nprocs=2)
-        # 2. start by ``paddle.distributed.launch``
-        # train()
-
-.. py:method:: scale_loss(loss)
-
-缩放模型损失值 ``loss`` 。在数据并行模式中,损失值 ``loss`` 需要根据并行训练进程的数目进行缩放。
-
-如果不在数据并行模式下,会直接返回原 ``loss`` 。
-
-参数:
-    - **loss** (Variable) - 当前模型的损失值。
-
-返回:缩放后的损失值 ``loss``
-
-返回类型:Variable
-
-**代码示例**
-
-.. code-block:: python
-
-    import paddle
-    import paddle.nn as nn
-    import paddle.optimizer as opt
-    import paddle.distributed as dist
-
-    class LinearNet(nn.Layer):
-        def __init__(self):
-            super(LinearNet, self).__init__()
-            self._linear1 = nn.Linear(10, 10)
-            self._linear2 = nn.Linear(10, 1)
-
-        def forward(self, x):
-            return self._linear2(self._linear1(x))
-
-    def train():
-        # 1. enable dynamic mode
-        paddle.disable_static()
-
-        # 2. initialize parallel environment
-        dist.init_parallel_env()
-
-        # 3. create data parallel layer & optimizer
-        layer = LinearNet()
-        dp_layer = paddle.DataParallel(layer)
-
-        loss_fn = nn.MSELoss()
-        adam = opt.Adam(
-            learning_rate=0.001, parameters=dp_layer.parameters())
-
-        # 4. run layer
-        inputs = paddle.randn([10, 10], 'float32')
-        outputs = dp_layer(inputs)
-        labels = paddle.randn([10, 1], 'float32')
-        loss = loss_fn(outputs, labels)
-
-        loss = dp_layer.scale_loss(loss)
-        loss.backward()
-        dp_layer.apply_collective_grads()
-
-        adam.step()
-        adam.clear_grad()
-
-    if __name__ == '__main__':
-        # 1. start by ``paddle.distributed.spawn`` (default)
-        dist.spawn(train, nprocs=2)
-        # 2. start by ``paddle.distributed.launch``
-        # train()
-
-
-.. py:method:: apply_collective_grads()
-
-AllReduce(规约)参数的梯度值。
-
-返回:无
-
-**代码示例**
-
-.. code-block:: python
-
-    import paddle
-    import paddle.nn as nn
-    import paddle.optimizer as opt
-    import paddle.distributed as dist
-
-    class LinearNet(nn.Layer):
-        def __init__(self):
-            super(LinearNet, self).__init__()
-            self._linear1 = nn.Linear(10, 10)
-            self._linear2 = nn.Linear(10, 1)
-
-        def forward(self, x):
-            return self._linear2(self._linear1(x))
-
-    def train():
-        # 1. enable dynamic mode
-        paddle.disable_static()
-
-        # 2. initialize parallel environment
-        dist.init_parallel_env()
-
-        # 3. create data parallel layer & optimizer
-        layer = LinearNet()
-        dp_layer = paddle.DataParallel(layer)
-
-        loss_fn = nn.MSELoss()
-        adam = opt.Adam(
-            learning_rate=0.001, parameters=dp_layer.parameters())
-
-        # 4. run layer
-        inputs = paddle.randn([10, 10], 'float32')
-        outputs = dp_layer(inputs)
-        labels = paddle.randn([10, 1], 'float32')
-        loss = loss_fn(outputs, labels)
-
-        loss = dp_layer.scale_loss(loss)
-        loss.backward()
-        dp_layer.apply_collective_grads()

         adam.step()
         adam.clear_grad()
diff --git a/doc/paddle/beginners_guide/dygraph/DyGraph.md b/doc/paddle/beginners_guide/dygraph/DyGraph.md
index 29dbb786d4b..4f35c97bd08 100644
--- a/doc/paddle/beginners_guide/dygraph/DyGraph.md
+++ b/doc/paddle/beginners_guide/dygraph/DyGraph.md
@@ -426,9 +426,7 @@ for epoch in range(epoch_num):
         loss = paddle.nn.functional.cross_entropy(cost, label)
         avg_loss = paddle.mean(loss)

-        avg_loss = mnist.scale_loss(avg_loss)
         avg_loss.backward()
-        mnist.apply_collective_grads()

         adam.minimize(avg_loss)
         mnist.clear_gradients()
@@ -477,7 +475,7 @@ trainers_endpoints: 127.0.0.1:6170,127.0.0.1:6171 , node_id: 0 , current_node_ip
 总结一下,多卡训练相比单卡训练,有如下步骤不同:
 1. 通过 ParallelEnv() 的 dev_id 设置程序运行的设备。
 ```
-place = paddle.CUDAPlace(paddle.imperative.ParallelEnv().dev_id)
+place = paddle.CUDAPlace(paddle.imperative.ParallelEnv().device_id)
 paddle.enable_imperative(place):
 ```
 2. 准备多卡环境。
@@ -497,23 +495,14 @@ mnist = paddle.imperative.DataParallel(mnist, strategy)
 ```
 train_reader = paddle.incubate.reader.distributed_batch_reader(train_reader)
 ```
-
-5. 单步训练。
-
-首先对 loss 进行归一化,然后计算单卡的梯度,最终将所有的梯度聚合。
-```
-avg_loss = mnist.scale_loss(avg_loss)
-avg_loss.backward()
-mnist.apply_collective_grads()
-```
-6. 模型保存。
+5. 模型保存。

 和单卡不同,多卡训练时需逐个进程执行保存操作,多个进程同时保存会使模型文件格式出错。
 ```
-if paddle.imperative.ParallelEnv().local_rank == 0:
+if paddle.imperative.ParallelEnv().rank == 0:
     paddle.imperative.save(mnist.state_dict(), "worker_0")
 ```
-7. 评估测试。
+6. 评估测试。

 对模型进行评估测试时,如果需要加载模型,须确保评估和保存的操作在同一个进程中,否则可能出现模型尚未保存完成,即启动评估,造成加载出错的问题。如果不需要加载模型,则没有这个问题,在一个进程或多个进程中评估均可。
diff --git a/doc/paddle/guides/01_paddle2.0_introduction/upgrade_guide_cn.md b/doc/paddle/guides/01_paddle2.0_introduction/upgrade_guide_cn.md
index 68179704157..89bb3263acc 100644
--- a/doc/paddle/guides/01_paddle2.0_introduction/upgrade_guide_cn.md
+++ b/doc/paddle/guides/01_paddle2.0_introduction/upgrade_guide_cn.md
@@ -339,12 +339,7 @@ def train():
             predicts = net(x_data)
             acc = paddle.metric.accuracy(predicts, y_data, k=2)
             avg_acc = paddle.mean(acc)
             loss = paddle.nn.functional.cross_entropy(predicts, y_data)
-
-            # 第3处改动,归一化loss
-            avg_loss = net.scale_loss(avg_loss)
             avg_loss.backward()
-            # 第4处改动,同步梯度
-            net.apply_collective_grads()
             if batch_id % 100 == 0:
                 print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, avg_loss.numpy(), avg_acc.numpy()))
             adam.step()
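
For reference, the pattern the updated docs converge on is the simplified data-parallel loop below. This is a minimal sketch assembled from the post-change examples kept in the diff above; it assumes the 2.0-rc dynamic-graph API and at least two visible GPUs. The explicit `scale_loss()` and `apply_collective_grads()` calls are gone, on the assumption (implied by this change) that loss scaling and gradient allreduce are handled internally for a layer wrapped in `paddle.DataParallel` when `loss.backward()` runs.

```python
import paddle
import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist

class LinearNet(nn.Layer):
    def __init__(self):
        super(LinearNet, self).__init__()
        self._linear1 = nn.Linear(10, 10)
        self._linear2 = nn.Linear(10, 1)

    def forward(self, x):
        return self._linear2(self._linear1(x))

def train():
    # dynamic graph mode (kept for parity with the doc examples)
    paddle.disable_static()

    # initialize the parallel environment (one process per GPU)
    dist.init_parallel_env()

    # wrap the layer; DataParallel owns gradient synchronization
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    # no scale_loss() / apply_collective_grads() here; backward() is
    # expected to scale the loss and allreduce the gradients
    loss.backward()

    adam.step()
    adam.clear_grad()

if __name__ == '__main__':
    # launch 2 worker processes; starting via paddle.distributed.launch
    # with a plain train() call also works, as in the doc examples
    dist.spawn(train, nprocs=2)
```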