DistributedTraining.cpp
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
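
// Example: data-parallel training of a small multi-layer perceptron.
// Each MPI process trains on its own shard of the data; model parameters
// and gradients are kept in sync across processes with all-reduce operations.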
#include <cmath> // for M_PI
#include <iostream>

#include <flashlight/flashlight.h>

using namespace fl;
int main() {
  af::info();
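
  // Initialize the distributed environment over MPI; rank and world size
  // are taken from the MPI runtime rather than passed in explicitly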
  fl::distributedInit(
      fl::DistributedInit::MPI,
      -1, // worldRank - unused. Automatically derived from `MPI_Comm_rank`
      -1, // worldSize - unused. Automatically derived from `MPI_Comm_size`
      {{fl::DistributedConstants::kMaxDevicePerNode, "8"}} // config: max devices per node
  );

  auto worldSize = fl::getWorldSize();
  auto worldRank = fl::getWorldRank();
  bool isMaster = (worldRank == 0);
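
  // Seed each process differently so every rank draws a distinct shard of
  // random training data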
  af::setSeed(worldRank);

  // Create dataset
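  // (each process keeps an equal 10000 / worldSize share of the samples)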
  const int nSamples = 10000 / worldSize;
  const int nFeat = 10;
  auto X = af::randu(nFeat, nSamples) + 1; // X elements in [1, 2]
  auto Y = /* signal */ af::sum(af::pow(X, 3), 0).T() +
      /* noise */ af::sin(2 * M_PI * af::randu(nSamples));

  // Create Dataset to simplify the code for iterating over samples
  TensorDataset data({X, Y});
  const int inputIdx = 0, targetIdx = 1;

  // Model definition - 2-layer Perceptron with ReLU activation
  auto model = std::make_shared<Sequential>();
  model->add(Linear(nFeat, 100));
  model->add(ReLU());
  model->add(Linear(100, 1));

  // MSE loss
  auto loss = MeanSquaredError();

  // Synchronize the model parameters so that every process starts from the
  // same weights
  fl::allReduceParameters(model);

  // Add a hook to synchronize gradients of model parameters as they are
  // computed
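  // (the 1 / worldSize scale turns the all-reduced gradient sum into an
  // average over processes)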
  fl::distributeModuleGrads(model, 1.0 / worldSize);

  // Optimizer definition
  const float learningRate = 0.0001;
  const float momentum = 0.9;
  auto sgd = SGDOptimizer(model->params(), learningRate, momentum);

  // Meter definition
  AverageValueMeter meter;

  // Start training
  if (isMaster) {
    std::cout << "[Multi-layer Perceptron] Started..." << std::endl;
  }
  const int nEpochs = 100;
  for (int e = 1; e <= nEpochs; ++e) {
    meter.reset();
    for (auto& sample : data) {
      sgd.zeroGrad();

      // Forward propagation
      auto result = model->forward(input(sample[inputIdx]));

      // Calculate loss
      auto l = loss(result, noGrad(sample[targetIdx]));

      // Backward propagation
      l.backward();

      // Update parameters
      sgd.step();

      meter.add(l.scalar<float>());
    }
    auto mse = meter.value();
    auto mseArr = af::array(1, &mse[0]);
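    // All-reduce sums the per-process mean losses; dividing by worldSize
    // below gives the average MSE across all processes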
    fl::allReduce(mseArr);
    if (isMaster) {
      std::cout << "Epoch: " << e << " Mean Squared Error: "
                << mseArr.scalar<double>() / worldSize << std::endl;
    }
  }
  if (isMaster) {
    std::cout << "[Multi-layer Perceptron] Done!" << std::endl;
  }
  return 0;
}
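
// A typical launch for this example (assuming an MPI-enabled flashlight
// build and a standard MPI launcher; the binary name here is hypothetical):
//
//   mpirun -n 8 ./DistributedTraining
//
// With 8 processes, each rank trains on its own 1250-sample shard while
// gradients are averaged across ranks after every backward pass.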