Skip to content

Commit

Permalink
GEMM convo engine backprop (kernel).
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexey Kamenev committed Apr 21, 2016
1 parent f13ba49 commit 9b702ea
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 23 deletions.
39 changes: 37 additions & 2 deletions Source/Math/CPUMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4139,8 +4139,10 @@ template <class ElemType>
void CPUMatrix<ElemType>::UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
size_t batchSize = GetNumCols();

#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
for (int64_t sample = 0; sample < (int64_t)batchSize; sample++)
{
for (size_t row = 0; row < mapOutSize; row++)
{
Expand All @@ -4157,7 +4159,7 @@ void CPUMatrix<ElemType>::UnrollConvolutionInput(size_t unrollCols, size_t mapOu
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
output.Data()[(row * GetNumCols() + sample) * unrollCols + skip + i] = (*this)(colBase + dcol, sample);
output.Data()[(row * batchSize + sample) * unrollCols + skip + i] = (*this)(colBase + dcol, sample);
}
}
}
Expand Down Expand Up @@ -4205,6 +4207,39 @@ void CPUMatrix<ElemType>::UnrollConvolutionOutput(size_t unrollCols, size_t mapI
}
}

template <class ElemType>
void CPUMatrix<ElemType>::UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
size_t batchSize = GetNumCols();
size_t unrollCols = mapOutSize * batchSize;

//#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)batchSize; sample++)
{
for (size_t row = 0; row < mapOutSize; row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());

int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
size_t idst = (skip + i) * unrollCols + row * batchSize + sample;
assert(idst < output.GetNumElements());
output.Data()[idst] = (*this)(colBase + dcol, sample);
}
}
}
}

template <class ElemType>
void CPUMatrix<ElemType>::MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{
Expand Down
2 changes: 2 additions & 0 deletions Source/Math/CPUMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,8 @@ class MATH_API CPUMatrix : public BaseMatrix<ElemType>
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;

void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
Expand Down
94 changes: 79 additions & 15 deletions Source/Math/ConvolutionEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -550,19 +550,9 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
RuntimeError("GEMM convolution engine supports only CHW/cudnn layout.");
if (IsGpu(m_deviceId))
RuntimeError("GEMM convolution engine currently supports only CPU device.");

const auto& inT = m_geometry->InputShape();
const auto& kernT = m_geometry->KernelShape();
size_t dimCount = inT.GetRank();
if (kernT[dimCount - 1] != inT[dimCount - 1])
{
RuntimeError("GEMM convolution engine does not support this convolution configuration. "
"It is possible to make GEMM engine work with this configuration by defining "
"input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
}
}

// The forward method consists of 2 parts:
// The forward method consists of 2 parts (using 2D convolution notation but applicable to ND as well):
// 1. Unrolling convolution input into a matrix. Note that the matrix would be unrolled as if it were
// in CHWN format. Using this format allows to perform convolution for the whole minibatch as a single GEMM
// which is not possible with NCHW format. Alternatively, NHWC format (used in legacy engine) could be used
Expand Down Expand Up @@ -647,6 +637,12 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
// Each row of the unrolled output will be of size/layout HWK.
// Otherwise we need to increase convolution input/kernel dimension to dimCount + 1 (this is checked in EnsureCompatible).
assert(kernT[dimCount - 1] == inT[dimCount - 1]);
if (kernT[dimCount - 1] != inT[dimCount - 1])
{
RuntimeError("GEMM convolution engine does not support this convolution configuration. "
"It is possible to make GEMM engine work with this configuration by defining "
"input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
}

size_t mapInCount = kernT[dimCount - 1];
size_t mapOutCount = m_geometry->GetMapCount(dimCount - 1);
Expand Down Expand Up @@ -718,12 +714,80 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
}
}

// The backward kernel method consists of 3 parts (using 2D convolution notation but applicable to ND as well):
// 1. Transpose and reshape convolution output matrix (srcGrad) into [NW'H' x K] (column-major) layout.
// This step is not needed if current minibatch size == 1.
// 2. Unrolling convolution input (in) into a matrix of [NW'H' x WHC] (column-major) layaout.
// 3. Performing matrix multiplication of unrolled input with transposed output:
// [NW'H' x WHC]^T * [NW'H' x K] = [WHC x K] (column major) - kernel gradients.
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
{
UNUSED(srcGrad); UNUSED(in); UNUSED(kernelGrad); UNUSED(workspace);
//srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
//RuntimeError("Not yet implemented.");
}
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);

const auto& inT = m_geometry->InputShape();
const auto& kernT = m_geometry->KernelShape();
const auto& outT = m_geometry->OutputShape();

size_t dimCount = inT.GetRank();
size_t mapOutCount = m_geometry->GetMapCount(dimCount - 1);
size_t mapOutSize = outT.GetNumElements() / mapOutCount;

assert(kernT[dimCount - 1] == inT[dimCount - 1]);
if (kernT[dimCount - 1] != inT[dimCount - 1])
{
RuntimeError("GEMM convolution engine does not support this convolution configuration. "
"It is possible to make GEMM engine work with this configuration by defining "
"input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
}

size_t unrollRows = kernT.GetNumElements();
size_t unrollCols = mapOutSize * subBatchSize;

// Reserve space for:
// 1. Unrolled inputs.
// 2. Transposed source gradients (optional).
workspace.Resize(unrollCols, unrollRows + (subBatchSize > 1 ? mapOutCount : 0));

for (size_t start = 0; start < batchSize; start += subBatchSize)
{
size_t curBatchSize = min(subBatchSize, batchSize - start);
// 1. Transpose and reshape srcGrad.
auto srcGradSlice = srcGrad.ColumnSlice(start, curBatchSize);
if (curBatchSize > 1)
{
auto srcGradTranSlice = workspace.ColumnSlice(unrollRows, mapOutCount);
if (curBatchSize != subBatchSize)
{
srcGradTranSlice.Reshape(mapOutCount * mapOutSize, subBatchSize);
srcGradTranSlice = srcGradTranSlice.ColumnSlice(0, curBatchSize);
}
// Reshape to transposed shape - required by AssignTransposeOf.
srcGradTranSlice.Reshape(srcGradSlice.GetNumCols(), srcGradSlice.GetNumRows());
srcGradTranSlice.AssignTransposeOf(srcGradSlice);
srcGradSlice = srcGradTranSlice.ColumnSlice(0, srcGradTranSlice.GetNumCols());
}
srcGradSlice.Reshape(mapOutSize * curBatchSize, mapOutCount);

// 2. Unroll inputs.
auto inputSlice = in.ColumnSlice(start, curBatchSize);
auto unrolledInputSlice = workspace.ColumnSlice(0, unrollRows);
if (curBatchSize != subBatchSize)
{
unrolledInputSlice.Reshape(mapOutSize * unrollRows, subBatchSize);
unrolledInputSlice = unrolledInputSlice.ColumnSlice(0, curBatchSize);
}
unrolledInputSlice.Reshape(mapOutSize * curBatchSize, unrollRows);
unrolledInputSlice.SetValue(0);
inputSlice.UnrollConvolutionInputForKernelBackprop(mapOutSize, m_mpRowCol, *m_mpRowRun, *m_runs, unrolledInputSlice);

// cudnn layout uses row-major kernel weight matrix.
auto kernGrad = kernelGrad.ColumnSlice(0, kernelGrad.GetNumCols());
kernGrad.Reshape(kernelGrad.GetNumCols(), kernelGrad.GetNumRows());
// 3. Multiply.
Mat::MultiplyAndAdd(unrolledInputSlice, true, srcGradSlice, false, kernGrad);
}
}

public:
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry)
Expand Down
19 changes: 19 additions & 0 deletions Source/Math/Matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4068,6 +4068,25 @@ void Matrix<ElemType>::UnrollConvolutionOutput(size_t unrollCols, size_t mapInCo
NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);

DecideAndMoveToRightDevice(*this, output);

DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->UnrollConvolutionInputForKernelBackprop(mapOutSize, *(mpRowCol.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
Expand Down
2 changes: 2 additions & 0 deletions Source/Math/Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,8 @@ class MATH_API Matrix : public MatrixBase
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;

void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
Expand Down
14 changes: 8 additions & 6 deletions Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ std::vector<ConvolveGeometryPtr> GenerateConvTestConfigs()
// For debugging.
res.push_back(std::make_shared<ConvolveGeometry>(TensorShape(3, 3, 1),
TensorShape(3, 3, 1), TensorShape(2), TensorShape(1, 1, 1),
ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{false, false, false},
ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{true, true, false},
TensorShape(0), TensorShape(0)));

// Simple 3D convolution.
Expand All @@ -103,8 +103,8 @@ std::vector<ConvolveGeometryPtr> GenerateConvTestConfigs()
TensorShape(0), TensorShape(0)));
// Example of 3D convolution that can be represented with 3D tensors in reference engine
// but requires 4D tensors in other engines.
res.push_back(std::make_shared<ConvolveGeometry>(TensorShape(5, 5, 2, 1),
TensorShape(3, 3, 1, 1), TensorShape(2), TensorShape(1),
res.push_back(std::make_shared<ConvolveGeometry>(TensorShape(5, 5, 3, 1),
TensorShape(3, 3, 2, 1), TensorShape(2), TensorShape(1),
ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{false},
TensorShape(0), TensorShape(0)));

Expand Down Expand Up @@ -315,13 +315,15 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardKernel)
};

int baseDeviceId = 0;
auto engKind = ConvolutionEngineKind::Reference;
for (int deviceId : {-1, 0})
for (const auto& engCfg : GetTestEngineConfigs())
{
auto engKind = std::get<0>(engCfg);
auto deviceId = std::get<1>(engCfg);
auto maxTempMem = std::get<2>(engCfg);
for (const auto& g : GenerateConvTestConfigs())
{
auto baseEng = ConvEng::Create(g, baseDeviceId, ImageLayoutKind::CHW, 0, PoolKind::None, ConvolutionEngineKind::CuDnn);
auto testEng = ConvEng::Create(g, deviceId, ImageLayoutKind::CHW, 0, PoolKind::None, engKind);
auto testEng = ConvEng::Create(g, deviceId, ImageLayoutKind::CHW, maxTempMem, PoolKind::None, engKind);

size_t n = batchSizeG(rng);
vec buf;
Expand Down

0 comments on commit 9b702ea

Please sign in to comment.