Skip to content

Commit

Permalink
Some refactoring in preparation for separating out the 1bit gradient …
Browse files Browse the repository at this point in the history
…aggregation implementation from the core CNTK sources
  • Loading branch information
amitaga committed Jan 5, 2016
1 parent ff9a916 commit c944295
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 63 deletions.
3 changes: 2 additions & 1 deletion Source/Math/Math.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="MatrixQuantizerImpl.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="TensorView.h" />
<None Include="GPUWatcher.cu" />
Expand Down Expand Up @@ -200,8 +201,8 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="CPUMatrix.cpp" />
<ClCompile Include="MatrixQuantizer.cpp" />
<ClCompile Include="MatrixQuantizerCPU.cpp" />
<ClCompile Include="MatrixQuantizerImpl.cpp" />
<ClCompile Include="NoGPU.cpp" />
<ClCompile Include="Matrix.cpp" />
<ClCompile Include="QuantizedMatrix.cpp" />
Expand Down
5 changes: 4 additions & 1 deletion Source/Math/Math.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
<ClCompile Include="QuantizedMatrix.cpp">
<Filter>1bitSGD</Filter>
</ClCompile>
<ClCompile Include="MatrixQuantizer.cpp">
<ClCompile Include="MatrixQuantizerImpl.cpp">
<Filter>1bitSGD</Filter>
</ClCompile>
</ItemGroup>
Expand Down Expand Up @@ -100,6 +100,9 @@
<ClInclude Include="MatrixQuantizer.h">
<Filter>1bitSGD</Filter>
</ClInclude>
<ClInclude Include="MatrixQuantizerImpl.h">
<Filter>1bitSGD</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">
Expand Down
17 changes: 9 additions & 8 deletions Source/Math/MatrixQuantizerCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType>
MatrixQuantizerCPU<ElemType>::MatrixQuantizerCPU(size_t numRows, size_t numCols)
: MatrixQuantizer<ElemType>(numRows, numCols, CPUDEVICE)
MatrixQuantizerCPU<ElemType>::MatrixQuantizerCPU()
: MatrixQuantizerImpl<ElemType>(CPUDEVICE)
{
}

template<class ElemType>
void MatrixQuantizerCPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
void MatrixQuantizerCPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit)
{
// The outQMatrix should be on the CPU
// TODO: Support transferring the quantization output to a quantized matrix on the GPU
Expand All @@ -23,7 +23,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// Verify that the different matrix parameters have matching dimensions
assert((outQMatrix.GetNumRows() == nRow) && (outQMatrix.GetNumCols() == nCol));
assert((this->m_residual->GetNumRows() == nRow) && (this->m_residual->GetNumCols() == nCol));
assert((inResidual.GetNumRows() == nRow) && (inResidual.GetNumCols() == nCol));
assert((outResidual.GetNumRows() == nRow) && (outResidual.GetNumCols() == nCol));

const size_t ldNbits = ValueQuantizer<ElemType>::ld (nBits);
#ifdef QUANTUSEPPL
Expand All @@ -36,24 +37,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (zeroThresholdFor1Bit)
{
// Explicit use of 'template' keyword is needed to compile with GCC
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
}
else
{
// Explicit use of 'template' keyword is needed to compile with GCC
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, nBits, qcol.lower, qcol.upper);
}

ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
if (zeroThresholdFor1Bit)
{
// Explicit use of 'template' keyword is needed to compile with GCC
q.template Quantize<true>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, qcol.bits, this->m_residual->BufferPointer());
q.template Quantize<true>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, qcol.bits, outResidual.BufferPointer());
}
else
{
// Explicit use of 'template' keyword is needed to compile with GCC
q.template Quantize<false>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(), (long)nRow, j, qcol.bits, this->m_residual->BufferPointer());
q.template Quantize<false>(inMatrix.BufferPointer(), inResidual.BufferPointer(), (long)nRow, j, qcol.bits, outResidual.BufferPointer());
}
}
#ifdef QUANTUSEPPL
Expand Down
8 changes: 4 additions & 4 deletions Source/Math/MatrixQuantizerCPU.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once

#include "MatrixQuantizer.h"
#include "MatrixQuantizerImpl.h"
#include "ColumnQuantizer.h"
#include "QuantizedMatrix.h"
#include "CPUMatrix.h"
Expand All @@ -19,16 +19,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {

//see dbn::matrix quantizer
template<class ElemType>
class MatrixQuantizerCPU final : public MatrixQuantizer<ElemType>
class MatrixQuantizerCPU final : public MatrixQuantizerImpl<ElemType>
{
public:
MatrixQuantizerCPU(size_t numRows, size_t numCols);
MatrixQuantizerCPU();

// Disallow copy construction and assignment
MatrixQuantizerCPU(const MatrixQuantizerCPU&) = delete;
MatrixQuantizerCPU& operator=(const MatrixQuantizerCPU&) = delete;

void QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit) override;
void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit) override;
void WaitQuantizeAsyncDone() override;

void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) override;
Expand Down
23 changes: 12 additions & 11 deletions Source/Math/MatrixQuantizerGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType>
QuantizedMatrix<ElemType>& MatrixQuantizerGPU<ElemType>::GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated)
QuantizedMatrix<ElemType>& MatrixQuantizerGPU<ElemType>::GetTempGPUQuantizedMatrix(size_t numRows, size_t numCols, size_t nBits, bool& newlyAllocated)
{
newlyAllocated = false;

// Check if the existing one is good for our needs
if ((m_tempGPUQuantizedMatrix != nullptr) && (m_tempGPUQuantizedMatrix->GetNumBits() == nBits))
if ((m_tempGPUQuantizedMatrix != nullptr) && (m_tempGPUQuantizedMatrix->GetNumBits() == nBits) && (m_tempGPUQuantizedMatrix->GetNumRows() >= numRows) && (m_tempGPUQuantizedMatrix->GetNumCols() >= numCols))
{
return *m_tempGPUQuantizedMatrix;
}
Expand All @@ -171,7 +171,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_tempGPUQuantizedMatrix = nullptr;
}

m_tempGPUQuantizedMatrix = new QuantizedMatrix<ElemType>(this->m_residual->GetNumRows(), this->m_residual->GetNumCols(), nBits, (short)this->GetDeviceId());
m_tempGPUQuantizedMatrix = new QuantizedMatrix<ElemType>(numRows, numCols, nBits, (short)this->GetDeviceId());
newlyAllocated = true;

return *m_tempGPUQuantizedMatrix;
Expand All @@ -180,8 +180,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///cpubuffer should be page-locked memory allocated, otherwise CUDA will not be efficient (hence we don't use STL)
template<class ElemType>
MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(size_t numRows, size_t numCols, int deviceId, bool useDedicatedComputeStream, bool forceSync /*= false*/)
: MatrixQuantizer<ElemType>(numRows, numCols, deviceId), m_quantizeCompleteEvent(NULL), m_fetchCompleteEvent(NULL),
MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(int deviceId, bool useDedicatedComputeStream, bool forceSync /*= false*/)
: MatrixQuantizerImpl<ElemType>(deviceId), m_quantizeCompleteEvent(NULL), m_fetchCompleteEvent(NULL),
m_tempMatrixZeroingCompleteEvent(NULL), m_assignCompleteEvent(NULL), m_forceSync(forceSync), m_tempGPUQuantizedMatrix(nullptr),
m_quantizeOpIncludedFetch(false)
{
Expand Down Expand Up @@ -224,11 +224,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType>
void MatrixQuantizerGPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
void MatrixQuantizerGPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit)
{
// Verify various input matrix parameter's dimensions
assert((inMatrix.GetNumRows() == outQMatrix.GetNumRows()) && (inMatrix.GetNumCols() == outQMatrix.GetNumCols()));
assert((inMatrix.GetNumRows() == this->m_residual->GetNumRows()) && (inMatrix.GetNumCols() == this->m_residual->GetNumCols()));
assert((inMatrix.GetNumRows() == inResidual.GetNumRows()) && (inMatrix.GetNumCols() == inResidual.GetNumCols()));
assert((inMatrix.GetNumRows() == outResidual.GetNumRows()) && (inMatrix.GetNumCols() == outResidual.GetNumCols()));

size_t nBits = outQMatrix.GetNumBits();

Expand All @@ -239,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

bool GPUMatrixNewlyAllocated = false;
QuantizedMatrix<ElemType>& outQMatrixGPU = (outQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(nBits, GPUMatrixNewlyAllocated) : outQMatrix;
QuantizedMatrix<ElemType>& outQMatrixGPU = (outQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(outQMatrix.GetNumRows(), outQMatrix.GetNumCols(), nBits, GPUMatrixNewlyAllocated) : outQMatrix;

// If we newly allocated the target GPU matrix then the async zeroing of the matrix is still in progress on
// the main compute stream. We must synchronize with the main compute stream in case the quantization
Expand All @@ -251,10 +252,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

// Do the quantization on the compute stream and insert an event into the stream
_QuantizeMatrix<ElemType>(inMatrix.BufferPointer(), this->m_residual->BufferPointer(),
_QuantizeMatrix<ElemType>(inMatrix.BufferPointer(), inResidual.BufferPointer(),
inMatrix.GetNumRows(), inMatrix.GetNumCols(),
outQMatrixGPU.GetArray(), nBits, GetComputeStream(),
this->m_residual->BufferPointer(), zeroThresholdFor1Bit);
outResidual.BufferPointer(), zeroThresholdFor1Bit);

RecordQuantizeCompleteEvent(GetComputeStream());

Expand Down Expand Up @@ -296,7 +297,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert((inQMatrix.GetNumRows() == outMatrix.GetNumRows()) && (inQMatrix.GetNumCols() == outMatrix.GetNumCols()));

bool GPUMatrixNewlyAllocated = false;
QuantizedMatrix<ElemType>& inQMatrixGPU = (inQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(nBits, GPUMatrixNewlyAllocated) : inQMatrix;
QuantizedMatrix<ElemType>& inQMatrixGPU = (inQMatrix.GetDeviceId() == CPUDEVICE) ? GetTempGPUQuantizedMatrix(inQMatrix.GetNumRows(), inQMatrix.GetNumCols(), nBits, GPUMatrixNewlyAllocated) : inQMatrix;

if (inQMatrix.GetDeviceId() == CPUDEVICE)
{
Expand Down
10 changes: 5 additions & 5 deletions Source/Math/MatrixQuantizerGPU.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "QuantizedMatrix.h" // TODO: strangely, this must be included first, although it is the first thing MatrixQuantizer.h includes. Without, nvcc fails.
#include "MatrixQuantizer.h"
#include "MatrixQuantizerImpl.h"
#include "ColumnQuantizer.h"
#include "GPUMatrix.h"
#ifndef CPUONLY
Expand All @@ -14,10 +14,10 @@
namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType>
class MatrixQuantizerGPU : public MatrixQuantizer<ElemType>
class MatrixQuantizerGPU : public MatrixQuantizerImpl<ElemType>
{
public:
MatrixQuantizerGPU(size_t numRows, size_t numCols, int deviceId, bool useDedicatedComputeStream, bool forceSync = false);
MatrixQuantizerGPU(int deviceId, bool useDedicatedComputeStream, bool forceSync = false);
~MatrixQuantizerGPU();

// Disallow copy and move construction and assignment
Expand All @@ -26,15 +26,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
MatrixQuantizerGPU(MatrixQuantizerGPU&&) = delete;
MatrixQuantizerGPU& operator=(MatrixQuantizerGPU&&) = delete;

void QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit) override;
void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit) override;
void WaitQuantizeAsyncDone() override;

void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) override;
void WaitUnquantizeAsyncDone() override;

private:
// Helper function to get a temporary intermediate matrix on the GPU to store quantization results
QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated);
QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t numRows, size_t numCols, size_t nBits, bool& newlyAllocated);

#ifndef CPUONLY
// Record an event to flag the completion of the quantization/unquantization kernel on the compute stream
Expand Down
Original file line number Diff line number Diff line change
@@ -1,59 +1,33 @@
#include "stdafx.h"
#include "Matrix.h"
#include "MatrixQuantizer.h"
#include "MatrixQuantizerImpl.h"
#include "MatrixQuantizerCPU.h"
#include "BestGpu.h" // for CPUONLY
#include "MatrixQuantizerGPU.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType>
/*static*/ MatrixQuantizer<ElemType>*
MatrixQuantizer<ElemType>::CreateMatrixQuantizer(size_t numRows, size_t numCols, int deviceId, bool useAsync)
/*static*/ MatrixQuantizerImpl<ElemType>* MatrixQuantizerImpl<ElemType>::CreateMatrixQuantizerImpl(int deviceId, bool useAsync)
{
if (deviceId >= 0)
{
#ifndef CPUONLY
bool useDedicatedComputeStream = useAsync;
return new MatrixQuantizerGPU<ElemType>(numRows, numCols, deviceId, useDedicatedComputeStream);
return new MatrixQuantizerGPU<ElemType>(deviceId, useDedicatedComputeStream);
#else
useAsync;
RuntimeError("CreateMatrixQuantizer: attempted to use GPU while compiled without GPU support");
#endif
}
else
{
return new MatrixQuantizerCPU<ElemType>(numRows, numCols);
return new MatrixQuantizerCPU<ElemType>();
}
}

template<class ElemType>
MatrixQuantizer<ElemType>::MatrixQuantizer(size_t numRows, size_t numCols, int deviceId)
{
m_residual = new Matrix<ElemType>(numRows, numCols, deviceId, DENSE);
}

template<class ElemType>
MatrixQuantizer<ElemType>::~MatrixQuantizer()
{
if (nullptr != m_residual)
{
delete m_residual;
m_residual = nullptr;
}
}

template<class ElemType>
void MatrixQuantizer<ElemType>::ResetResidue()
{
m_residual->SetValue(0.0);
}


template class MatrixQuantizer<float>;
template class MatrixQuantizer<double>;
template class MatrixQuantizerImpl<float>;
template class MatrixQuantizerImpl<double>;


MatrixComputeStreamEvent* MatrixComputeStreamEvent::Create(int deviceId)
{
if (deviceId >= 0)
Expand Down
69 changes: 69 additions & 0 deletions Source/Math/MatrixQuantizerImpl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "ColumnQuantizer.h"
#include "QuantizedMatrix.h"

#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#else // no DLLs on Linux
#define MATH_API
#endif

namespace Microsoft { namespace MSR { namespace CNTK {

// Abstract, device-specific backend for quantizing and unquantizing matrices.
// The concrete backends in this change are MatrixQuantizerCPU and MatrixQuantizerGPU.
// The backend holds no residual of its own: the residual is passed in and out of
// QuantizeAsync() explicitly by the caller.
template<class ElemType>
class MATH_API MatrixQuantizerImpl
{
public:
    // Factory. Per its definition in MatrixQuantizerImpl.cpp, a deviceId >= 0 yields a
    // GPU backend (with useAsync forwarded as its 'useDedicatedComputeStream' flag) and
    // any other deviceId yields a CPU backend. In a CPUONLY build a deviceId >= 0 is
    // reported through RuntimeError.
    // The object is allocated with 'new' and returned as a raw pointer.
    // NOTE(review): presumably the caller owns it and must delete it - confirm at call sites.
    static MatrixQuantizerImpl<ElemType>* CreateMatrixQuantizerImpl(int deviceId, bool useAsync);

    // Virtual so that a backend can be destroyed through a base-class pointer.
    virtual ~MatrixQuantizerImpl() {}

    // Disallow copy and move construction and assignment
    MatrixQuantizerImpl(const MatrixQuantizerImpl&) = delete;
    MatrixQuantizerImpl& operator=(const MatrixQuantizerImpl&) = delete;
    MatrixQuantizerImpl(MatrixQuantizerImpl&&) = delete;
    MatrixQuantizerImpl& operator=(MatrixQuantizerImpl&&) = delete;

    // Quantizes inMatrix, taking inResidual into account, into outQMatrix and writes the
    // resulting residual to outResidual.
    // Both visible backends assert that inResidual, outResidual and outQMatrix have the
    // same number of rows and columns as inMatrix.
    // zeroThresholdFor1Bit is forwarded unchanged to the quantization routines.
    // NOTE(review): its exact numerical effect is defined there - confirm before relying on it.
    // NOTE(review): presumably WaitQuantizeAsyncDone() must be called before the outputs are read - confirm.
    virtual void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit) = 0;
    virtual void WaitQuantizeAsyncDone() = 0;

    // Converts inQMatrix back into outMatrix.
    // NOTE(review): 'add' presumably selects accumulating into outMatrix instead of
    // overwriting it - confirm against the backends.
    virtual void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) = 0;
    virtual void WaitUnquantizeAsyncDone() = 0;

protected:
    // Only derived backends construct this type. 'explicit' prevents an int from being
    // used as an implicit conversion to the base type.
    explicit MatrixQuantizerImpl(int deviceId) : m_deviceId(deviceId) {}

    // Device id this backend was constructed with.
    int GetDeviceId() const
    {
        return m_deviceId;
    }

private:
    int m_deviceId;
};

// Records events on the main matrix computation work stream and lets
// callers synchronize against them.
class MATH_API MatrixComputeStreamEvent
{
public:
    // Factory for the given device.
    // NOTE(review): the definition branches on deviceId >= 0; presumably that selects a
    // GPU-specific event type - confirm in MatrixQuantizerImpl.cpp.
    static MatrixComputeStreamEvent* Create(int deviceId);

    virtual ~MatrixComputeStreamEvent();

    // NOTE(review): presumably waits until the recorded event has completed - confirm.
    virtual void SynchronizeEvent();

    // NOTE(review): presumably makes the quantization compute stream for ElemType wait
    // on this event - confirm against the definition.
    template <typename ElemType>
    void SynchronizeQuantizationComputeStreamWithEvent();

protected:
    // Constructed only through Create() or by derived types.
    MatrixComputeStreamEvent(int deviceId);

    int m_deviceId;
};

}}}
2 changes: 1 addition & 1 deletion Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
std::unique_ptr<MemAllocator> allocator(deviceId == CPUDEVICE ? nullptr : new CUDAPageLockedMemAllocator(deviceId));

Matrix<ElemType> inMatrix(numRows, numCols, deviceId);
std::unique_ptr<MatrixQuantizer<ElemType>> quantizer(MatrixQuantizer<ElemType>::CreateMatrixQuantizer(numRows, numCols, deviceId, false /*useAsync*/));
std::unique_ptr<MatrixQuantizer<ElemType>> quantizer(new MatrixQuantizer<ElemType>(numRows, numCols, deviceId, false /*useAsync*/));

// Verify that the initial residue is comprised of all zeros
verifyAllZerosFunc(quantizer->GetResidualMatrix());
Expand Down

0 comments on commit c944295

Please sign in to comment.