Skip to content

Commit

Permalink
Fixed issues with synchronizing streams when copying from GPU to CPU
Browse files Browse the repository at this point in the history
* by default, synchronize default_stream after resizeAndCopyFrom
* add sync in some places after resizeAndCopyFrom using other streams
  • Loading branch information
Haonan authored and reyoung committed Sep 14, 2016
1 parent 42a1179 commit 688eeef
Show file tree
Hide file tree
Showing 11 changed files with 30 additions and 24 deletions.
1 change: 0 additions & 1 deletion paddle/gserver/evaluators/CTCErrorEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,6 @@ class CTCErrorEvaluator : public Evaluator {
Argument output, label;
output.resizeAndCopyFrom(arguments[0], false);
label.resizeAndCopyFrom(arguments[1], false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
CHECK(label.sequenceStartPositions);
CHECK(label.ids);
size_t numSequences = label.sequenceStartPositions->getSize() - 1;
Expand Down
6 changes: 1 addition & 5 deletions paddle/gserver/gradientmachines/MultiGradientMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() {
outArgs_.resize(outputGradArgs.size());
for (size_t i = 0; i < outputGradArgs.size(); i++) {
outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
multiMachine_->useGpu(),
HPPL_STREAM_DEFAULT);
}
if (multiMachine_->useGpu()) {
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
multiMachine_->useGpu());
}
gradientMachine_->setOutputGrad(outArgs_);
}
Expand Down
1 change: 1 addition & 0 deletions paddle/gserver/layers/CTCLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) {
for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
} else {
forwardImp(getInput(0), getInput(1));
Expand Down
1 change: 1 addition & 0 deletions paddle/gserver/layers/CostLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
}
forwardImpIn(output, label, cost);
}
Expand Down
1 change: 1 addition & 0 deletions paddle/gserver/layers/SamplingIdLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class SamplingIdLayer : public Layer {
for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
forwardImp(tmpCpuInput_[0]);
} else {
forwardImp(getInput(0));
Expand Down
2 changes: 0 additions & 2 deletions paddle/gserver/tests/LayerGradUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
testLayer->forward(PASS_TEST);
Argument out;
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
if (batchOut.value) {
size_t dim = batchOut.value->getWidth();
ASSERT_TRUE((bool)out.value);
Expand Down Expand Up @@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
testLayer->forward(PASS_TEST);
Argument out;
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
if (batchOut.value) {
size_t dim = batchOut.value->getWidth();
ASSERT_TRUE((bool)out.value);
Expand Down
1 change: 0 additions & 1 deletion paddle/gserver/tests/test_RecurrentLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
Argument& cpuInput = testCpu.dataLayer_->getOutput();
Argument& gpuInput = testGpu.dataLayer_->getOutput();
gpuInput.resizeAndCopyFrom(cpuInput, true);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);

const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
Expand Down
2 changes: 2 additions & 0 deletions paddle/math/Matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
if (!matrix) {
matrix = Matrix::create(height, width, trans, useGpu);
} else {
CHECK_EQ(matrix->useGpu(), useGpu);
matrix->resize(height, width);
}
}
Expand All @@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
} else {
CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
dynamic_cast<GpuSparseMatrix*>(matrix.get()));
CHECK_EQ(matrix->useGpu(), useGpu);
matrix->resize(height, width, nnz, valueType, format);
}
}
Expand Down
1 change: 1 addition & 0 deletions paddle/math/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
} else if ((!useGpu) && (!cpuVectorT_)) {
cpuVectorT_ = VectorT<T>::create(size, false);
} else {
CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
this->resize(size, useGpu);
}
}
Expand Down
27 changes: 16 additions & 11 deletions paddle/parameter/Argument.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,8 @@ namespace paddle {
static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
hl_stream_t stream) {
if (src) {
if (!dest) {
dest = src->clone(0, 0, useGpu);
} else {
dest->resize(src->getHeight(), src->getWidth());
}
Matrix::resizeOrCreate(dest, src->getHeight(),
src->getWidth(), false, useGpu);
dest->copyFrom(*src, stream);
} else {
dest.reset();
Expand Down Expand Up @@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
hl_stream_t stream = HPPL_STREAM_DEFAULT) {
if (src) {
CHECK_LE((size_t)startRow + copySize, src->getHeight());

int height = copySize;
int width = src->getWidth();
if (!dest) {
dest = src->clone(height, width, useGpu);
} else {
dest->resize(height, width);
}
Matrix::resizeOrCreate(dest, height, width, false, useGpu);
MatrixPtr submat = src->subMatrix(startRow, copySize);
if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
// copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
Expand Down Expand Up @@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
}
}

/// Synchronous convenience overload: resizes this Argument to match `src` and
/// copies its contents on HPPL_STREAM_DEFAULT, then blocks until the copy has
/// completed. Callers therefore do NOT need to call hl_stream_synchronize()
/// themselves (unlike the stream-taking overload, which is asynchronous).
/// @param src    source Argument to copy from
/// @param useGpu whether the destination buffers live on the GPU
void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
  // Block until the (potentially async) device copy on the default stream is
  // done, so the data is safe to read as soon as this function returns.
  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
}

void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
hl_stream_t stream) {
dataId = src.dataId;
Expand All @@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
resizeAndCopy(strs, src.strs, useGpu, stream);
}

/// Synchronous convenience overload: copies up to `copySize` sequences of
/// `src` starting at `startSeq`, using HPPL_STREAM_DEFAULT, and blocks until
/// the transfer has finished before returning.
/// @param src      source Argument to copy from
/// @param startSeq index of the first sequence to copy
/// @param copySize number of sequences to copy
/// @param useGpu   whether the destination buffers live on the GPU
/// @return the number of samples actually copied
int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                                    int32_t copySize, bool useGpu) {
  // Delegate to the asynchronous overload on the default stream...
  const int32_t numCopied =
      resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT);
  // ...then wait for it, so callers can read the data immediately.
  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  return numCopied;
}

int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu,
hl_stream_t stream) {
Expand Down
11 changes: 7 additions & 4 deletions paddle/parameter/Argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,14 @@ struct Argument {
* return value: how many samples are copied
*/
int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu = FLAGS_use_gpu,
hl_stream_t stream = HPPL_STREAM_DEFAULT);
int32_t copySize, bool useGpu, hl_stream_t stream);

void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu,
hl_stream_t stream = HPPL_STREAM_DEFAULT);
int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu = FLAGS_use_gpu);

void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);

void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);

/*
@brief Concatenate several arguments into one and put the result into it.
Expand Down

0 comments on commit 688eeef

Please sign in to comment.