GEMM convo engine backprop (kernel).

yorek · Apr 21, 2016 · 9b702ea · 9b702ea
1 parent f13ba49
commit 9b702ea
Show file tree

Hide file tree

Showing 6 changed files with 147 additions and 23 deletions.
diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp
@@ -4139,8 +4139,10 @@ template <class ElemType>
 void CPUMatrix<ElemType>::UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
                                                  const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
 {
+    size_t batchSize = GetNumCols();
+
 #pragma omp parallel for
-    for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
+    for (int64_t sample = 0; sample < (int64_t)batchSize; sample++)
     {
         for (size_t row = 0; row < mapOutSize; row++)
         {
@@ -4157,7 +4159,7 @@ void CPUMatrix<ElemType>::UnrollConvolutionInput(size_t unrollCols, size_t mapOu
                     continue;
                 int dcol = runs(i0 + i, 0);
                 assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
-                output.Data()[(row * GetNumCols() + sample) * unrollCols + skip + i] = (*this)(colBase + dcol, sample);
+                output.Data()[(row * batchSize + sample) * unrollCols + skip + i] = (*this)(colBase + dcol, sample);
             }
         }
     }
@@ -4205,6 +4207,39 @@ void CPUMatrix<ElemType>::UnrollConvolutionOutput(size_t unrollCols, size_t mapI
     }
 }
 
+template <class ElemType>
+void CPUMatrix<ElemType>::UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
+                                                                  const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
+{
+    size_t batchSize = GetNumCols();
+    size_t unrollCols = mapOutSize * batchSize;
+
+//#pragma omp parallel for
+    for (int64_t sample = 0; sample < (int64_t)batchSize; sample++)
+    {
+        for (size_t row = 0; row < mapOutSize; row++)
+        {
+            int colBase = mpRowCol(row, 0);
+            assert(0 <= colBase && colBase < GetNumRows());
+
+            int i0 = mpRowRun(row, 0);
+            int skip = runs(i0++, 0);
+            int size = runs(i0++, 0);
+            int imask = i0 + size;
+            for (int i = 0; i < size; i++)
+            {
+                if (runs(imask + i, 0) == 0)
+                    continue;
+                int dcol = runs(i0 + i, 0);
+                assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
+                size_t idst = (skip + i) * unrollCols + row * batchSize + sample;
+                assert(idst < output.GetNumElements());
+                output.Data()[idst] = (*this)(colBase + dcol, sample);
+            }
+        }
+    }
+}
+
 template <class ElemType>
 void CPUMatrix<ElemType>::MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
 {

diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h
@@ -352,6 +352,8 @@ class MATH_API CPUMatrix : public BaseMatrix<ElemType>
                                 const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
     void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const CPUMatrix<int>& mpRowCol,
                                  const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
+    void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
+                                                 const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
 
     void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
     void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,

diff --git a/Source/Math/ConvolutionEngine.cpp b/Source/Math/ConvolutionEngine.cpp
@@ -550,19 +550,9 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
             RuntimeError("GEMM convolution engine supports only CHW/cudnn layout.");
         if (IsGpu(m_deviceId))
             RuntimeError("GEMM convolution engine currently supports only CPU device.");
-
-        const auto& inT = m_geometry->InputShape();
-        const auto& kernT = m_geometry->KernelShape();
-        size_t dimCount = inT.GetRank();
-        if (kernT[dimCount - 1] != inT[dimCount - 1])
-        {
-            RuntimeError("GEMM convolution engine does not support this convolution configuration. "
-                         "It is possible to make GEMM engine work with this configuration by defining "
-                         "input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
-        }
     }
 
-    // The forward method consists of 2 parts:
+    // The forward method consists of 2 parts (using 2D convolution notation but applicable to ND as well):
     // 1. Unrolling convolution input into a matrix. Note that the matrix would be unrolled as if it were 
     //    in CHWN format. Using this format allows to perform convolution for the whole minibatch as a single GEMM
     //    which is not possible with NCHW format. Alternatively, NHWC format (used in legacy engine) could be used
@@ -647,6 +637,12 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
         // Each row of the unrolled output will be of size/layout HWK.
         // Otherwise we need to increase convolution input/kernel dimension to dimCount + 1 (this is checked in EnsureCompatible).
         assert(kernT[dimCount - 1] == inT[dimCount - 1]);
+        if (kernT[dimCount - 1] != inT[dimCount - 1])
+        {
+            RuntimeError("GEMM convolution engine does not support this convolution configuration. "
+                         "It is possible to make GEMM engine work with this configuration by defining "
+                         "input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
+        }
 
         size_t mapInCount  = kernT[dimCount - 1];
         size_t mapOutCount = m_geometry->GetMapCount(dimCount - 1);
@@ -718,12 +714,80 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
         }
     }
 
+    // The backward kernel method consists of 3 parts (using 2D convolution notation but applicable to ND as well):
+    // 1. Transpose and reshape convolution output matrix (srcGrad) into [NW'H' x K] (column-major) layout.
+    //    This step is not needed if current minibatch size == 1.
+    // 2. Unrolling convolution input (in) into a matrix of [NW'H' x WHC] (column-major) layaout.
+    // 3. Performing matrix multiplication of unrolled input with transposed output:
+    //    [NW'H' x WHC]^T * [NW'H' x K] = [WHC x K] (column major) - kernel gradients.
     void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
     {
-        UNUSED(srcGrad); UNUSED(in); UNUSED(kernelGrad); UNUSED(workspace);
-        //srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
-        //RuntimeError("Not yet implemented.");
-    }
+        size_t batchSize = srcGrad.GetNumCols();
+        size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
+
+        const auto& inT = m_geometry->InputShape();
+        const auto& kernT = m_geometry->KernelShape();
+        const auto& outT = m_geometry->OutputShape();
+
+        size_t dimCount = inT.GetRank();
+        size_t mapOutCount = m_geometry->GetMapCount(dimCount - 1);
+        size_t mapOutSize = outT.GetNumElements() / mapOutCount;
+
+        assert(kernT[dimCount - 1] == inT[dimCount - 1]);
+        if (kernT[dimCount - 1] != inT[dimCount - 1])
+        {
+            RuntimeError("GEMM convolution engine does not support this convolution configuration. "
+                         "It is possible to make GEMM engine work with this configuration by defining "
+                         "input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
+        }
+
+        size_t unrollRows = kernT.GetNumElements();
+        size_t unrollCols = mapOutSize * subBatchSize;
+
+        // Reserve space for:
+        // 1. Unrolled inputs.
+        // 2. Transposed source gradients (optional).
+        workspace.Resize(unrollCols, unrollRows + (subBatchSize > 1 ? mapOutCount : 0));
+
+        for (size_t start = 0; start < batchSize; start += subBatchSize)
+        {
+            size_t curBatchSize = min(subBatchSize, batchSize - start);
+            // 1. Transpose and reshape srcGrad.
+            auto srcGradSlice = srcGrad.ColumnSlice(start, curBatchSize);
+            if (curBatchSize > 1)
+            {
+                auto srcGradTranSlice = workspace.ColumnSlice(unrollRows, mapOutCount);
+                if (curBatchSize != subBatchSize)
+                {
+                    srcGradTranSlice.Reshape(mapOutCount * mapOutSize, subBatchSize);
+                    srcGradTranSlice = srcGradTranSlice.ColumnSlice(0, curBatchSize);
+                }
+                // Reshape to transposed shape - required by AssignTransposeOf.
+                srcGradTranSlice.Reshape(srcGradSlice.GetNumCols(), srcGradSlice.GetNumRows());
+                srcGradTranSlice.AssignTransposeOf(srcGradSlice);
+                srcGradSlice = srcGradTranSlice.ColumnSlice(0, srcGradTranSlice.GetNumCols());
+            }
+            srcGradSlice.Reshape(mapOutSize * curBatchSize, mapOutCount);
+
+            // 2. Unroll inputs.
+            auto inputSlice = in.ColumnSlice(start, curBatchSize);
+            auto unrolledInputSlice = workspace.ColumnSlice(0, unrollRows);
+            if (curBatchSize != subBatchSize)
+            {
+                unrolledInputSlice.Reshape(mapOutSize * unrollRows, subBatchSize);
+                unrolledInputSlice = unrolledInputSlice.ColumnSlice(0, curBatchSize);
+            }
+            unrolledInputSlice.Reshape(mapOutSize * curBatchSize, unrollRows);
+            unrolledInputSlice.SetValue(0);
+            inputSlice.UnrollConvolutionInputForKernelBackprop(mapOutSize, m_mpRowCol, *m_mpRowRun, *m_runs, unrolledInputSlice);
+
+            // cudnn layout uses row-major kernel weight matrix.
+            auto kernGrad = kernelGrad.ColumnSlice(0, kernelGrad.GetNumCols());
+            kernGrad.Reshape(kernelGrad.GetNumCols(), kernelGrad.GetNumRows());
+            // 3. Multiply.
+            Mat::MultiplyAndAdd(unrolledInputSlice, true, srcGradSlice, false, kernGrad);
+        }
+}
 
 public:
     static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry)

diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp
@@ -4068,6 +4068,25 @@ void Matrix<ElemType>::UnrollConvolutionOutput(size_t unrollCols, size_t mapInCo
                             NOT_IMPLEMENTED);
 }
 
+template <class ElemType>
+void Matrix<ElemType>::UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
+                                                               const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
+{
+    assert(mpRowCol.GetNumCols() == 1);
+    assert(mpRowRun.GetNumCols() == 1);
+    assert(runs.GetNumCols() == 1);
+
+    DecideAndMoveToRightDevice(*this, output);
+
+    DISPATCH_MATRIX_ON_FLAG(this,
+                            this,
+                            m_CPUMatrix->UnrollConvolutionInputForKernelBackprop(mapOutSize, *(mpRowCol.m_CPUMatrix),
+                                                                                 *(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
+                            NOT_IMPLEMENTED,
+                            NOT_IMPLEMENTED,
+                            NOT_IMPLEMENTED);
+}
+
 template <class ElemType>
 void Matrix<ElemType>::MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
 {

diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
@@ -483,6 +483,8 @@ class MATH_API Matrix : public MatrixBase
                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
     void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const Matrix<int>& mpRowCol,
                                  const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
+    void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
+                                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
 
     void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
     void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,

diff --git a/Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp b/Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp
@@ -93,7 +93,7 @@ std::vector<ConvolveGeometryPtr> GenerateConvTestConfigs()
     // For debugging.
     res.push_back(std::make_shared<ConvolveGeometry>(TensorShape(3, 3, 1),
         TensorShape(3, 3, 1), TensorShape(2), TensorShape(1, 1, 1),
-        ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{false, false, false},
+        ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{true, true, false},
         TensorShape(0), TensorShape(0)));
 
     // Simple 3D convolution.
@@ -103,8 +103,8 @@ std::vector<ConvolveGeometryPtr> GenerateConvTestConfigs()
         TensorShape(0), TensorShape(0)));
     // Example of 3D convolution that can be represented with 3D tensors in reference engine
     // but requires 4D tensors in other engines.
-    res.push_back(std::make_shared<ConvolveGeometry>(TensorShape(5, 5, 2, 1),
-        TensorShape(3, 3, 1, 1), TensorShape(2), TensorShape(1),
+    res.push_back(std::make_shared<ConvolveGeometry>(TensorShape(5, 5, 3, 1),
+        TensorShape(3, 3, 2, 1), TensorShape(2), TensorShape(1),
         ConvolveGeometry::BoolVec{true}, ConvolveGeometry::BoolVec{false},
         TensorShape(0), TensorShape(0)));
 
@@ -315,13 +315,15 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardKernel)
     };
 
     int baseDeviceId = 0;
-    auto engKind = ConvolutionEngineKind::Reference;
-    for (int deviceId : {-1, 0})
+    for (const auto& engCfg : GetTestEngineConfigs())
     {
+        auto engKind = std::get<0>(engCfg);
+        auto deviceId = std::get<1>(engCfg);
+        auto maxTempMem = std::get<2>(engCfg);
         for (const auto& g : GenerateConvTestConfigs())
         {
             auto baseEng = ConvEng::Create(g, baseDeviceId, ImageLayoutKind::CHW, 0, PoolKind::None, ConvolutionEngineKind::CuDnn);
-            auto testEng = ConvEng::Create(g, deviceId, ImageLayoutKind::CHW, 0, PoolKind::None, engKind);
+            auto testEng = ConvEng::Create(g, deviceId, ImageLayoutKind::CHW, maxTempMem, PoolKind::None, engKind);
 
             size_t n = batchSizeG(rng);
             vec buf;