Some refactoring in preparation for separating out the 1bit gradient aggregation implementation from the core CNTK sources
LightningDev · Jan 5, 2016 · c944295 · c944295
1 parent ff9a916
commit c944295
Show file tree

Hide file tree

Showing 9 changed files with 112 additions and 63 deletions.
diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj
@@ -163,6 +163,7 @@
     <ClInclude Include="CommonMatrix.h" />
     <ClInclude Include="ConvolutionEngine.h" />
     <ClInclude Include="CPUMatrix.h" />
+    <ClInclude Include="MatrixQuantizerImpl.h" />
     <ClInclude Include="TensorOps.h" />
     <ClInclude Include="TensorView.h" />
     <None Include="GPUWatcher.cu" />
@@ -200,8 +201,8 @@
       </PrecompiledHeader>
     </ClCompile>
     <ClCompile Include="CPUMatrix.cpp" />
-    <ClCompile Include="MatrixQuantizer.cpp" />
     <ClCompile Include="MatrixQuantizerCPU.cpp" />
+    <ClCompile Include="MatrixQuantizerImpl.cpp" />
     <ClCompile Include="NoGPU.cpp" />
     <ClCompile Include="Matrix.cpp" />
     <ClCompile Include="QuantizedMatrix.cpp" />

diff --git a/Source/Math/Math.vcxproj.filters b/Source/Math/Math.vcxproj.filters
@@ -41,7 +41,7 @@
     <ClCompile Include="QuantizedMatrix.cpp">
       <Filter>1bitSGD</Filter>
     </ClCompile>
-    <ClCompile Include="MatrixQuantizer.cpp">
+    <ClCompile Include="MatrixQuantizerImpl.cpp">
       <Filter>1bitSGD</Filter>
     </ClCompile>
   </ItemGroup>
@@ -100,6 +100,9 @@
     <ClInclude Include="MatrixQuantizer.h">
       <Filter>1bitSGD</Filter>
     </ClInclude>
+    <ClInclude Include="MatrixQuantizerImpl.h">
+      <Filter>1bitSGD</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="GPUMatrix.h">

diff --git a/Source/Math/MatrixQuantizerCPU.cpp b/Source/Math/MatrixQuantizerCPU.cpp
@@ -4,13 +4,13 @@
 namespace Microsoft { namespace MSR { namespace CNTK {
 
     template<class ElemType>
-    MatrixQuantizerCPU<ElemType>::MatrixQuantizerCPU(size_t numRows, size_t numCols)
-        : MatrixQuantizer<ElemType>(numRows, numCols, CPUDEVICE)
+    MatrixQuantizerCPU<ElemType>::MatrixQuantizerCPU()
+        : MatrixQuantizerImpl<ElemType>(CPUDEVICE)
     {
     }
 
     template<class ElemType>
-    void  MatrixQuantizerCPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
+    void  MatrixQuantizerCPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit)
     {
         // The outQMatrix should be on the CPU
         // TODO: Support transferring the quantization output to a quantized matrix on the GPU 
@@ -23,7 +23,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         // Verify that the different matrix parameters have matching dimensions
         assert((outQMatrix.GetNumRows() == nRow) && (outQMatrix.GetNumCols() == nCol));
-        assert((this->m_residual->GetNumRows() == nRow) && (this->m_residual->GetNumCols() == nCol));
+        assert((inResidual.GetNumRows() == nRow) && (inResidual.GetNumCols() == nCol));
+        assert((outResidual.GetNumRows() == nRow) && (outResidual.GetNumCols() == nCol));
 
         const size_t ldNbits = ValueQuantizer<ElemType>::ld (nBits);
     #ifdef QUANTUSEPPL
@@ -36,24 +37,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (zeroThresholdFor1Bit)
             {
                 // Explicit use of 'template' keyword is needed to compile with GCC
-                ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
+                ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
             }
             else
             {
                 // Explicit use of 'template' keyword is needed to compile with GCC
-                ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
+                ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
             }
 
             ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
             if (zeroThresholdFor1Bit)
             {
                 // Explicit use of 'template' keyword is needed to compile with GCC
-                q.template Quantize<true>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, qcol.bits, this->m_residual->BufferPointer());
+                q.template Quantize<true>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, qcol.bits, outResidual.BufferPointer());
             }
             else
             {
                 // Explicit use of 'template' keyword is needed to compile with GCC
-                q.template Quantize<false>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, qcol.bits, this->m_residual->BufferPointer());
+                q.template Quantize<false>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, qcol.bits, outResidual.BufferPointer());
             }
         }
     #ifdef QUANTUSEPPL

diff --git a/Source/Math/MatrixQuantizerCPU.h b/Source/Math/MatrixQuantizerCPU.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "MatrixQuantizer.h"
+#include "MatrixQuantizerImpl.h"
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"
 #include "CPUMatrix.h"
@@ -19,16 +19,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     //see dbn::matrix quantizer
     template<class ElemType>
-    class MatrixQuantizerCPU final : public MatrixQuantizer<ElemType>
+    class MatrixQuantizerCPU final : public MatrixQuantizerImpl<ElemType>
     {
     public:    
-        MatrixQuantizerCPU(size_t numRows, size_t numCols);
+        MatrixQuantizerCPU();
 
         // Disallow copy construction and assignment
         MatrixQuantizerCPU(const MatrixQuantizerCPU&) = delete;
         MatrixQuantizerCPU& operator=(const MatrixQuantizerCPU&) = delete;
 
-        void QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit) override;
+        void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit) override;
         void WaitQuantizeAsyncDone() override;
 
         void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) override;

diff --git a/Source/Math/MatrixQuantizerGPU.cu b/Source/Math/MatrixQuantizerGPU.cu
@@ -155,12 +155,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     }
 
     template<class ElemType>
-    QuantizedMatrix<ElemType>& MatrixQuantizerGPU<ElemType>::GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated)
+    QuantizedMatrix<ElemType>& MatrixQuantizerGPU<ElemType>::GetTempGPUQuantizedMatrix(size_t numRows, size_t numCols, size_t nBits, bool& newlyAllocated)
     {
         newlyAllocated = false;
 
         // Check if the existing one is good for our needs
-        if ((m_tempGPUQuantizedMatrix != nullptr) && (m_tempGPUQuantizedMatrix->GetNumBits() == nBits))
+        if ((m_tempGPUQuantizedMatrix != nullptr) && (m_tempGPUQuantizedMatrix->GetNumBits() == nBits) && (m_tempGPUQuantizedMatrix->GetNumRows() >= numRows) && (m_tempGPUQuantizedMatrix->GetNumCols() >= numCols))
         {
             return *m_tempGPUQuantizedMatrix;
         }
@@ -171,7 +171,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_tempGPUQuantizedMatrix = nullptr;
         }
 
-        m_tempGPUQuantizedMatrix = new QuantizedMatrix<ElemType>(this->m_residual->GetNumRows(), this->m_residual->GetNumCols(), nBits, (short)this->GetDeviceId());
+        m_tempGPUQuantizedMatrix = new QuantizedMatrix<ElemType>(numRows, numCols, nBits, (short)this->GetDeviceId());
         newlyAllocated = true;
 
         return *m_tempGPUQuantizedMatrix;
@@ -180,8 +180,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ///cpubuffer should be page-locked memory allocated, otherwise CUDA will not be efficient (hence we don't use STL)
     template<class ElemType>
-    MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(size_t numRows, size_t numCols, int deviceId, bool useDedicatedComputeStream, bool forceSync /*= false*/)
-    : MatrixQuantizer<ElemType>(numRows, numCols, deviceId), m_quantizeCompleteEvent(NULL), m_fetchCompleteEvent(NULL),
+    MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(int deviceId, bool useDedicatedComputeStream, bool forceSync /*= false*/)
+    : MatrixQuantizerImpl<ElemType>(deviceId), m_quantizeCompleteEvent(NULL), m_fetchCompleteEvent(NULL),
     m_tempMatrixZeroingCompleteEvent(NULL), m_assignCompleteEvent(NULL), m_forceSync(forceSync), m_tempGPUQuantizedMatrix(nullptr),
     m_quantizeOpIncludedFetch(false)
     {
@@ -224,11 +224,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     }
 
     template<class ElemType>
-    void MatrixQuantizerGPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
+    void MatrixQuantizerGPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit)
     {
         // Verify various input matrix parameter's dimensions
         assert((inMatrix.GetNumRows() == outQMatrix.GetNumRows()) && (inMatrix.GetNumCols() == outQMatrix.GetNumCols()));
-        assert((inMatrix.GetNumRows() == this->m_residual->GetNumRows()) && (inMatrix.GetNumCols() == this->m_residual->GetNumCols()));
+        assert((inMatrix.GetNumRows() == inResidual.GetNumRows()) && (inMatrix.GetNumCols() == inResidual.GetNumCols()));
+        assert((inMatrix.GetNumRows() == outResidual.GetNumRows()) && (inMatrix.GetNumCols() == outResidual.GetNumCols()));
 
         size_t nBits = outQMatrix.GetNumBits();
 
@@ -239,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         bool GPUMatrixNewlyAllocated = false;
-        QuantizedMatrix<ElemType>& outQMatrixGPU = (outQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(nBits, GPUMatrixNewlyAllocated) : outQMatrix;
+        QuantizedMatrix<ElemType>& outQMatrixGPU = (outQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(outQMatrix.GetNumRows(), outQMatrix.GetNumCols(), nBits, GPUMatrixNewlyAllocated) : outQMatrix;
 
         // If we newly allocated the target GPU matrix then the async zeroing of the matrix is still in progress on
         // the main compute stream. We must synchronize with the main compute stream in case the quantization
@@ -251,10 +252,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         // Do the quantization on compute stream and insert event into stream
-        _QuantizeMatrix<ElemType>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(),
+        _QuantizeMatrix<ElemType>(inMatrix.BufferPointer(), inResidual.BufferPointer(),
                                   inMatrix.GetNumRows(), inMatrix.GetNumCols(),
                                   outQMatrixGPU.GetArray(), nBits, GetComputeStream(),
-                                  this->m_residual->BufferPointer(), zeroThresholdFor1Bit);
+                                  outResidual.BufferPointer(), zeroThresholdFor1Bit);
 
         RecordQuantizeCompleteEvent(GetComputeStream());
 
@@ -296,7 +297,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         assert((inQMatrix.GetNumRows() == outMatrix.GetNumRows()) && (inQMatrix.GetNumCols() == outMatrix.GetNumCols()));                
 
         bool GPUMatrixNewlyAllocated = false;
-        QuantizedMatrix<ElemType>& inQMatrixGPU = (inQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(nBits, GPUMatrixNewlyAllocated) : inQMatrix;
+        QuantizedMatrix<ElemType>& inQMatrixGPU = (inQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(inQMatrix.GetNumRows(), inQMatrix.GetNumCols(), nBits, GPUMatrixNewlyAllocated) : inQMatrix;
 
         if (inQMatrix.GetDeviceId() == CPUDEVICE)
         {

diff --git a/Source/Math/MatrixQuantizerGPU.h b/Source/Math/MatrixQuantizerGPU.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include "QuantizedMatrix.h"    // TODO: strangely, this must be included first, although it is the first thing MatrixQuantizer.h includes. Without, nvcc fails.
-#include "MatrixQuantizer.h"
+#include "MatrixQuantizerImpl.h"
 #include "ColumnQuantizer.h"
 #include "GPUMatrix.h"
 #ifndef CPUONLY
@@ -14,10 +14,10 @@
 namespace Microsoft { namespace MSR { namespace CNTK {
 
     template<class ElemType>
-    class MatrixQuantizerGPU : public MatrixQuantizer<ElemType>
+    class MatrixQuantizerGPU : public MatrixQuantizerImpl<ElemType>
     {
     public:
-        MatrixQuantizerGPU(size_t numRows, size_t numCols, int deviceId, bool useDedicatedComputeStream, bool forceSync = false);
+        MatrixQuantizerGPU(int deviceId, bool useDedicatedComputeStream, bool forceSync = false);
         ~MatrixQuantizerGPU();
 
         // Disallow copy and move construction and assignment
@@ -26,15 +26,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         MatrixQuantizerGPU(MatrixQuantizerGPU&&) = delete;
         MatrixQuantizerGPU& operator=(MatrixQuantizerGPU&&) = delete;
 
-        void QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit) override;
+        void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit) override;
         void WaitQuantizeAsyncDone() override;
 
         void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) override;
         void WaitUnquantizeAsyncDone() override;            
 
     private:        
         // Helper function to get a temporary intermediate matrix on the GPU to store quantization results
-        QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated);
+        QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t numRows, size_t numCols, size_t nBits, bool& newlyAllocated);
 
 #ifndef CPUONLY
         // Record an event to flag the completion of quantization/unquantization kernel on the compute stream

diff --git a/Source/Math/MatrixQuantizer.cpp → Source/Math/MatrixQuantizerImpl.cpp b/Source/Math/MatrixQuantizer.cpp → Source/Math/MatrixQuantizerImpl.cpp
@@ -1,59 +1,33 @@
 #include "stdafx.h"
 #include "Matrix.h"
-#include "MatrixQuantizer.h"
+#include "MatrixQuantizerImpl.h"
 #include "MatrixQuantizerCPU.h"
-#include "BestGpu.h"    // for CPUONLY
 #include "MatrixQuantizerGPU.h"
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
     template<class ElemType>
-    /*static*/ MatrixQuantizer<ElemType>*
-    MatrixQuantizer<ElemType>::CreateMatrixQuantizer(size_t numRows, size_t numCols, int deviceId, bool useAsync)
+    /*static*/ MatrixQuantizerImpl<ElemType>* MatrixQuantizerImpl<ElemType>::CreateMatrixQuantizerImpl(int deviceId, bool useAsync)
     {
         if (deviceId >= 0)
         {
 #ifndef CPUONLY
             bool useDedicatedComputeStream = useAsync;
-            return new MatrixQuantizerGPU<ElemType>(numRows, numCols, deviceId, useDedicatedComputeStream);
+            return new MatrixQuantizerGPU<ElemType>(deviceId, useDedicatedComputeStream);
 #else
             useAsync;
             RuntimeError("CreateMatrixQuantizer: attempted to use GPU while compiled without GPU support");
 #endif
         }
         else
         {
-            return new MatrixQuantizerCPU<ElemType>(numRows, numCols);
+            return new MatrixQuantizerCPU<ElemType>();
         }
     }
 
-    template<class ElemType>
-    MatrixQuantizer<ElemType>::MatrixQuantizer(size_t numRows, size_t numCols, int deviceId)
-    {
-        m_residual = new Matrix<ElemType>(numRows, numCols, deviceId, DENSE);
-    }
-
-    template<class ElemType>
-    MatrixQuantizer<ElemType>::~MatrixQuantizer()
-    {
-        if (nullptr != m_residual)
-        {
-            delete m_residual;
-            m_residual = nullptr;
-        }    
-    }
-
-    template<class ElemType>
-    void MatrixQuantizer<ElemType>::ResetResidue()
-    {
-        m_residual->SetValue(0.0);
-    }
-
-
-    template class MatrixQuantizer<float>;
-    template class MatrixQuantizer<double>;
+    template class MatrixQuantizerImpl<float>;
+    template class MatrixQuantizerImpl<double>;
 
-
     MatrixComputeStreamEvent* MatrixComputeStreamEvent::Create(int deviceId)
     {
         if (deviceId >= 0)

diff --git a/Source/Math/MatrixQuantizerImpl.h b/Source/Math/MatrixQuantizerImpl.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "ColumnQuantizer.h"
+#include "QuantizedMatrix.h"
+
+#ifdef    _WIN32
+#ifdef MATH_EXPORTS
+#define MATH_API __declspec(dllexport)
+#else
+#define MATH_API __declspec(dllimport)
+#endif
+#else    // no DLLs on Linux
+#define    MATH_API 
+#endif
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+    template<class ElemType>
+    class MATH_API MatrixQuantizerImpl
+    {
+    public:
+        static MatrixQuantizerImpl<ElemType>* CreateMatrixQuantizerImpl(int deviceId, bool useAsync);
+        virtual ~MatrixQuantizerImpl() {}
+
+        // Disallow copy and move construction and assignment
+        MatrixQuantizerImpl(const MatrixQuantizerImpl&) = delete;
+        MatrixQuantizerImpl& operator=(const MatrixQuantizerImpl&) = delete;
+        MatrixQuantizerImpl(MatrixQuantizerImpl&&) = delete;
+        MatrixQuantizerImpl& operator=(MatrixQuantizerImpl&&) = delete;
+
+        virtual void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit) = 0;
+        virtual void WaitQuantizeAsyncDone() = 0;
+
+        virtual void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) = 0;
+        virtual void WaitUnquantizeAsyncDone() = 0;
+
+    protected:
+        MatrixQuantizerImpl(int deviceId) : m_deviceId(deviceId) {}
+
+        int GetDeviceId() const
+        {
+            return m_deviceId;
+        }
+
+    private:
+        int m_deviceId;
+    };
+
+    // This type records and synchronizes events on the main 
+    // matrix computation work stream
+    class MATH_API MatrixComputeStreamEvent
+    {
+    public:
+        static MatrixComputeStreamEvent* Create(int deviceId);
+        virtual ~MatrixComputeStreamEvent();
+
+        virtual void SynchronizeEvent();
+
+        template <typename ElemType>
+        void SynchronizeQuantizationComputeStreamWithEvent();
+
+    protected:
+        MatrixComputeStreamEvent(int deviceId);
+
+    protected:
+        int m_deviceId;
+    };
+
+}}}
diff --git a/Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp b/Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp
@@ -256,7 +256,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
         std::unique_ptr<MemAllocator> allocator(deviceId == CPUDEVICE ? nullptr : new CUDAPageLockedMemAllocator(deviceId));
 
         Matrix<ElemType> inMatrix(numRows, numCols, deviceId);
-        std::unique_ptr<MatrixQuantizer<ElemType>> quantizer(MatrixQuantizer<ElemType>::CreateMatrixQuantizer(numRows, numCols, deviceId, false /*useAsync*/));
+        std::unique_ptr<MatrixQuantizer<ElemType>> quantizer(new MatrixQuantizer<ElemType>(numRows, numCols, deviceId, false /*useAsync*/));
 
         // Verify that the initial residue is comprised of all zeros
         verifyAllZerosFunc(quantizer->GetResidualMatrix());