Skip to content

Commit

Permalink
GPU version of Gather() and Scatter()
Browse files Browse the repository at this point in the history
  • Loading branch information
frankseide committed Mar 13, 2016
1 parent 2575da3 commit 0c62fb9
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 36 deletions.
9 changes: 5 additions & 4 deletions Source/Common/Include/Sequences.h
Original file line number Diff line number Diff line change
Expand Up @@ -1062,12 +1062,13 @@ static inline void MaskMissingColumnsTo(Matrix<ElemType>& matrixToMask, const MB
if (pMBLayout && pMBLayout->HasGaps(fr))
{
#if 0 // in the future we can use the tensor lib to implement this
const auto & maskMatrix = pMBLayout->GetColumnsValidMask<ElemType>();
auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout);
auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout);
TensorView<ElemType>(matrixSliceToMask).DoMaskNegativeOf(0, TensorView<ElemType>(matrixSliceToMask), TensorView<ElemType>(maskSlice), 1); val;
const auto & maskMatrix = pMBLayout->GetColumnsValidMask<ElemType>();
auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout);
auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout);
TensorView<ElemType>(matrixSliceToMask).DoMaskNegativeOf(0, TensorView<ElemType>(matrixSliceToMask), TensorView<ElemType>(maskSlice), 1); val;
#else
const auto& maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId());
maskMatrix.TransferToDeviceIfNotThere(matrixToMask.GetDeviceId(), /*ismoved=*/ false, /*emptyTransfer=*/ false, /*updatePreferredDevice=*/ false);
auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout);
auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout);
matrixSliceToMask.MaskColumnsValue(maskSlice, val);
Expand Down
1 change: 1 addition & 0 deletions Source/ComputationNetworkLib/ReshapingNodes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ template <class ElemType>
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
}
// the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true);
Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
}

Expand Down
4 changes: 2 additions & 2 deletions Source/Math/CPUMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
Resize(a.GetNumRows(), m.GetNumCols());

auto& us = *this;
//#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows.
#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows.
foreach_column(jOut, us)
{
auto jInF = m(0, jOut); // this is the column we need to get
Expand Down Expand Up @@ -750,7 +750,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
size_t jOut = (size_t)jOutF;
if (jOut >= GetNumCols())
InvalidArgument("DoGatherColumnsOf: Map out of bounds.");
ScaleAndAddColumn(beta, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha);
ScaleAndAddColumn(/*beta=*/(ElemType)1, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha);
}

return *this;
Expand Down
134 changes: 125 additions & 9 deletions Source/Math/GPUMatrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -918,11 +918,120 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignTransposeOf(const GPUMatrix<Elem
return *this;
}

// CUDA kernel for Gather: us[:, jOut] = a[:, m[jOut]] * alpha + us[:, jOut] * beta, one element per thread.
// Launch geometry (see DoGatherColumnsOf): blockIdx.x = output column, threadIdx.x = row.
// The launcher starts one thread per row, so this assumes numRows <= max threads per block -- TODO confirm.
// 'usStride'/'mStride'/'aStride' are the column strides (leading dimensions) of the respective buffers.
template <class ElemType>
__global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType beta, const ElemType* m, size_t mStride, const ElemType* a, size_t aStride, size_t aCols, const ElemType alpha)
{
size_t i = threadIdx.x; // index into 'us' and 'a'
size_t jOut = blockIdx.x; // index into 'us' and 'm'

auto jInF = m[jOut * mStride]; // this is the column we need to get
if (jInF < 0) // negative index means gap
return;
size_t jIn = (size_t)jInF;
if (jIn >= aCols)
return; // actually a failure
// NOTE(review): on a gap or out-of-range index the output element is left untouched. With
// beta == 0 the launcher Resize()s without clearing, so such columns may hold uninitialized
// data -- verify that callers never read gap columns of the result.

const ElemType& ra = a[i + jIn * aStride];
ElemType& rus = us[i + jOut * usStride];

ElemType res = ra * alpha;
// Only read the destination when beta != 0: after a fresh Resize() the buffer is uninitialized.
if (beta != 0)
res += rus * beta;
rus = res;
}

// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta
// Gathers columns of 'a' selected by the row-vector map 'm' into *this.
//  - beta == 0: *this is Resize()d (contents discarded), result is alpha * gathered columns
//  - beta != 0: *this must already have the target size; result is blended with the old contents
// Negative map entries denote gaps (those output columns are skipped by the kernel).
// Returns *this to allow call chaining.
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const GPUMatrix<ElemType>& m, const GPUMatrix<ElemType>& a, ElemType alpha)
{
if (m.GetNumRows() != 1) // index is 1-dimensional only
InvalidArgument("DoGatherColumnsOf: Map must be a row vector.");

if (beta)
VerifySize(a.GetNumRows(), m.GetNumCols());
else
Resize(a.GetNumRows(), m.GetNumCols());

if (m.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
a.PrepareDevice();

// One block per output column, one thread per row.
// NOTE(review): this launch fails for GetNumRows() > max threads per block (1024 on current
// hardware), and there is no post-launch error check here -- presumably SyncGuard surfaces
// launch failures on destruction; TODO confirm, otherwise add a cudaGetLastError() check.
SyncGuard syncGuard;
_doGatherColumnsOf<ElemType> << <GetNumCols(), GetNumRows(), 0, t_stream >> >(m_pArray, GetNumRows(), beta, m.m_pArray, 1, a.m_pArray, a.GetNumRows(), a.GetNumCols(), alpha);

return *this;
}

// CUDA kernel for Scatter: us[:, m[jIn]] += a[:, jIn] * alpha, one element per thread.
// Launch geometry (see DoScatterColumnsOf): blockIdx.x = source column of 'a', threadIdx.x = row;
// the launcher starts one thread per row, so this assumes numRows <= max threads per block -- TODO confirm.
// Negative map entries denote gaps and are skipped; out-of-range entries are silently dropped
// (they indicate a bad index upstream, but a kernel cannot report the error).
// The caller is responsible for pre-scaling 'us' with beta before launching this kernel.
template <class ElemType>
__global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* m, size_t mStride, const ElemType* a, size_t aStride, const ElemType alpha)
{
size_t i = threadIdx.x; // index into 'a' and 'us'
size_t jIn = blockIdx.x; // index into 'a' and 'm'

auto jOutF = m[jIn * mStride]; // this is the column we copy/add into
if (jOutF < 0) // negative index means gap
return;
size_t jOut = (size_t)jOutF;
if (jOut >= usCols)
return; // actually a failure

const ElemType& ra = a[i + jIn * aStride];
ElemType& rus = us[i + jOut * usStride];

ElemType res = ra * alpha;
// Multiple source columns may scatter into the same target column, so the add must be atomic.
// Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter())
// NOTE(review): this assumes an atomicAdd() overload exists for ElemType; native double
// support requires sm_60+, so a project-provided overload may be in play -- TODO confirm.
atomicAdd(&rus, res);
}

// little helper for debugging
template <class ElemType>
static void Peek(const GPUMatrix<ElemType>& m, const char* which)
{
size_t rows = m.GetNumRows();
size_t cols = m.GetNumCols();
ElemType buf[100] = { 0 };
size_t n = min(rows * cols, _countof(buf));
cudaMemcpy(buf, m.BufferPointer(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost);
UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
}

// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta
// Scatters the columns of 'a' into *this at the positions given by the row-vector map 'm'.
// Negative map entries denote gaps (those input columns are dropped). Several input columns
// may map to the same output column; their contributions are summed (atomically, in the kernel).
// NOTE(review): because the kernel uses atomicAdd, the summation order is unspecified, so
// floating-point results may differ slightly between runs when target columns collide -- confirm
// callers tolerate that.
// Returns *this to allow call chaining.
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, const GPUMatrix<ElemType>& m, const GPUMatrix<ElemType>& a, ElemType alpha)
{
if (m.GetNumRows() != 1) // index is 1-dimensional only
InvalidArgument("DoScatterColumnsOf: Map must be a row vector.");
if (m.GetNumCols() != a.GetNumCols())
InvalidArgument("DoScatterColumnsOf: Map must have width of input vector.");
if (a.GetNumRows() != GetNumRows())
InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector.");

if (m.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
a.PrepareDevice();

auto& us = *this;
//Peek(us, "us"); Peek(m, "m"); Peek(a, "a");

// pre-scale with beta upfront
// Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding.
Scale(beta, us); // if beta is 0, then this will be a memset()

// One block per input column, one thread per row.
// NOTE(review): fails for GetNumRows() > max threads per block (1024) -- TODO confirm callers stay below that.
SyncGuard syncGuard;
_doScatterColumnsOf<ElemType> << <a.GetNumCols(), a.GetNumRows(), 0, t_stream >> >(m_pArray, GetNumRows(), GetNumCols(), m.m_pArray, 1, a.m_pArray, a.GetNumRows(), alpha);

return *this;
}

template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const ElemType v)
{
if (IsEmpty())
LogicError("SetValue: Matrix is empty.");
return;

CUDA_LONG N = (CUDA_LONG) GetNumElements();

Expand Down Expand Up @@ -2979,14 +3088,16 @@ void GPUMatrix<ElemType>::Multiply(const GPUMatrix<ElemType>& a, const GPUMatrix
/// <param name="a">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void GPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
/*static*/ void GPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
{
InvalidArgument("All matrices must be on the same GPU");
}
else
{
if (a.IsEmpty() && c.IsEmpty())
return;
a.PrepareDevice();
if (a.IsEmpty() || c.IsEmpty())
LogicError("ScaleAndAdd: one of the input matrices is empty.");
Expand Down Expand Up @@ -3088,14 +3199,16 @@ void GPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUMatrix<ElemType>&
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void GPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
/*static*/ void GPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
{
if (a.GetComputeDeviceId() != c.GetComputeDeviceId() || a.GetComputeDeviceId() != b.GetComputeDeviceId())
{
InvalidArgument("All matrices must be on the same GPU");
}
else
{
if (a.IsEmpty() && b.IsEmpty())
return;
a.PrepareDevice();
if (a.IsEmpty() || b.IsEmpty())
LogicError("ScaleAndAdd: one of the input matrices is empty.");
Expand Down Expand Up @@ -3321,8 +3434,14 @@ void GPUMatrix<ElemType>::AddElementToElement(const GPUMatrix<ElemType>& a, cons
}
template <class ElemType>
void GPUMatrix<ElemType>::Scale(ElemType alpha, GPUMatrix<ElemType>& a)
/*static*/ void GPUMatrix<ElemType>::Scale(ElemType alpha, GPUMatrix<ElemType>& a)
{
if (alpha == 0) // if 0 then do not access the value, so that we can use this to multiply uninitialized matrices with beta=0
{
CUDA_CALL(cudaMemset(a.m_pArray, 0, a.m_numRows * a.m_numCols * sizeof(ElemType)));
return;
}
cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
if (sizeof(ElemType) == sizeof(float))
{
Expand All @@ -3341,7 +3460,7 @@ void GPUMatrix<ElemType>::Scale(ElemType alpha, GPUMatrix<ElemType>& a)
}
template <class ElemType>
void GPUMatrix<ElemType>::Scale(GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& a)
/*static*/ void GPUMatrix<ElemType>::Scale(GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& a)
{
if (alpha.GetNumElements() != 1)
{
Expand All @@ -3366,11 +3485,8 @@ void GPUMatrix<ElemType>::Scale(GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>&
}
template <class ElemType> // c = alpha * a
void GPUMatrix<ElemType>::Scale(ElemType alpha, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
/*static*/ void GPUMatrix<ElemType>::Scale(ElemType alpha, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
if (a.IsEmpty())
LogicError("Scale: Input matrix a is empty.");
c = a;
Scale(alpha, c);
}
Expand Down
3 changes: 3 additions & 0 deletions Source/Math/GPUMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ class MATH_API GPUMatrix : public BaseMatrix<ElemType>
GPUMatrix<ElemType> Transpose() const;
GPUMatrix<ElemType>& AssignTransposeOf(const GPUMatrix<ElemType>& a);

GPUMatrix<ElemType>& DoGatherColumnsOf (ElemType beta, const GPUMatrix<ElemType>& m, const GPUMatrix<ElemType>& a, ElemType alpha);
GPUMatrix<ElemType>& DoScatterColumnsOf(ElemType beta, const GPUMatrix<ElemType>& m, const GPUMatrix<ElemType>& a, ElemType alpha);

GPUMatrix<ElemType>& operator+=(const ElemType alpha);
GPUMatrix<ElemType> operator+(const ElemType alpha) const;
GPUMatrix<ElemType>& AssignSumOf(const ElemType alpha, const GPUMatrix<ElemType>& a);
Expand Down
36 changes: 20 additions & 16 deletions Source/Math/Matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1080,7 +1080,7 @@ Matrix<ElemType>& Matrix<ElemType>::DoGatherColumnsOf(ElemType beta, const Matri
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->DoGatherColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha),
NOT_IMPLEMENTED, //m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha),
m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);

Expand All @@ -1099,7 +1099,7 @@ Matrix<ElemType>& Matrix<ElemType>::DoScatterColumnsOf(ElemType beta, const Matr
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->DoScatterColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha),
NOT_IMPLEMENTED, //m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha),
m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);

Expand Down Expand Up @@ -1167,10 +1167,12 @@ template <class ElemType>
void Matrix<ElemType>::MaskColumnsValue(const Matrix<char>& columnsMask, ElemType val)
{
if (GetNumCols() != columnsMask.GetNumCols())
RuntimeError("Matrix and column mask must have equal number of columns");
RuntimeError("MaskColumnsValue: Matrix and column mask must have equal number of columns.");

if (GetDeviceId() != columnsMask.GetDeviceId())
RuntimeError("Matrix and column mask must be on the same device");
if (GetCurrentMatrixLocation() == CPU && (columnsMask.GetCurrentMatrixLocation() == CPU || columnsMask.GetCurrentMatrixLocation() == BOTH))
; // OK
else if (GetDeviceId() != columnsMask.GetDeviceId() && columnsMask.GetCurrentMatrixLocation() != BOTH)
RuntimeError("MaskColumnsValue: Matrix and column mask must be on the same device.");

DISPATCH_MATRIX_ON_FLAG(this,
this,
Expand Down Expand Up @@ -3470,7 +3472,8 @@ int Matrix<ElemType>::GetDeviceId() const
// The inputs are only distinguished in that a's GPU takes precedence over b's in case they differ.
// TODO: This is called somewhat inconsistently, sometimes with a=*this, sometimes with b=*this.
template <class ElemType>
void Matrix<ElemType>::DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
template <class ElemType2>
void Matrix<ElemType>::DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType2>& b)
{
int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId();
if (deviceIdA == deviceIdB)
Expand Down Expand Up @@ -3541,21 +3544,21 @@ void Matrix<ElemType>::DecideAndMoveToRightDevice(const Matrix<ElemType>& a, con
}

template <class ElemType>
void Matrix<ElemType>::_transferToDevice(int to_id, bool ismoved /*= true*/, bool emptyTransfer /* = false*/) const
// Transfers this matrix's data to device 'to_id' (no-op when already there).
// isBeingMoved: data is moved (source copy released) rather than mirrored.
// emptyTransfer: only allocate on the target, do not copy contents.
// Only matrices that own their buffer may be moved.
void Matrix<ElemType>::_transferToDevice(int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const
{
    const int from_id = GetDeviceId();
    if (from_id == to_id) // already on the target device: nothing to do
        return;

    if (!OwnBuffer()) // moving a borrowed buffer would pull it out from under its owner
        RuntimeError("Cannot move externally owned matrices to the preferred device.");

    _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer);
}
}

// this function performs data transfer and updates data location, but not the device that is stored with it
template <class ElemType>
void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool ismoved /*= true*/, bool emptyTransfer /* = false*/) const
void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const
{
if (from_id < 0)
from_id = CPUDEVICE;
Expand Down Expand Up @@ -3606,7 +3609,7 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
m_GPUSparseMatrix->SetValue(*m_CPUSparseMatrix);
}

if (ismoved)
if (isBeingMoved)
{
delete m_CPUSparseMatrix;
m_CPUSparseMatrix = NULL;
Expand All @@ -3632,7 +3635,7 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
m_GPUSparseMatrix->CopyToCPUSparseMatrix(*m_CPUSparseMatrix);
}

if (ismoved)
if (isBeingMoved)
{
delete m_GPUSparseMatrix;
m_GPUSparseMatrix = NULL;
Expand Down Expand Up @@ -3666,7 +3669,7 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
{
m_GPUMatrix = new GPUMatrix<ElemType>(to_id);
}
if (ismoved)
if (isBeingMoved)
{
delete m_CPUMatrix;
m_CPUMatrix = NULL;
Expand Down Expand Up @@ -3698,7 +3701,7 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
m_CPUMatrix = new CPUMatrix<ElemType>();
}

if (ismoved)
if (isBeingMoved)
{
delete m_GPUMatrix;
m_GPUMatrix = NULL;
Expand All @@ -3718,9 +3721,9 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
}

template <class ElemType>
void Matrix<ElemType>::TransferFromDeviceToDevice(int from_id, int to_id, bool ismoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const
// Public wrapper around _transferFromDeviceToDevice() that additionally (and optionally)
// records the target device as this matrix's preferred device.
// isBeingMoved: data is moved (source copy released) rather than mirrored.
// emptyTransfer: only allocate on the target, do not copy contents.
void Matrix<ElemType>::TransferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const
{
_transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer);
if (updatePreferredDevice)
m_preferredDeviceId = GetDeviceId();
}
}
Expand Down Expand Up @@ -5126,7 +5129,8 @@ template char* Matrix<char>::BufferPointer() const;
template int Matrix<char>::GetDeviceId() const;
template size_t Matrix<char>::GetNumElements() const;
template Matrix<char> Matrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
template void Matrix<char>::_transferToDevice(int id_to, bool ismoved, bool emptyTransfer) const;
template void Matrix<char>::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const;
template void Matrix<char>::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const;
template size_t Matrix<char>::GetNumRows() const;
template size_t Matrix<char>::GetNumCols() const;
template void Matrix<char>::SetValue(const char);
Expand Down
Loading

0 comments on commit 0c62fb9

Please sign in to comment.