Commit f6a2cc3

optimize the sparse element wise times kernels

jaurora committed Jun 10, 2016
1 parent e196db0 commit f6a2cc3

Showing 3 changed files with 202 additions and 41 deletions.
1 change: 1 addition & 0 deletions Source/Math/GPUMatrix.h
@@ -630,6 +630,7 @@ class SyncGuard
{
static bool DoSync()
{
//#undef NO_SYNC
#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
static bool do_sync = false;
#else
155 changes: 133 additions & 22 deletions Source/Math/GPUMatrixCUDAKernels.cuh
@@ -2743,39 +2743,150 @@ __global__ void _sparseCSRElemMulDense(
}
}

//template <class ElemType>
//__global__ void _sparseCSCElemMulsparseCSC(
// const int m,
// const CUDA_LONG n,
// const ElemType* a_dVal,
// const int* a_dRow,
// const int* a_dCol,
// const ElemType* b_dVal,
// const int* b_dRow,
// const int* b_dCol,
// ElemType* c)
//{
// CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
// if (id >= n)
// return;
// int startA = a_dCol[id];
// int endA = a_dCol[id + 1];
//
// int startB = b_dCol[id];
// int endB = b_dCol[id + 1];
//
// while (startA < endA && startB < endB)
// {
// int aRow = a_dRow[startA];
// int bRow = b_dRow[startB];
//
// if (aRow == bRow)
// {
// c[IDX2C(aRow, id, m)] = a_dVal[startA] * b_dVal[startB];
// startA++;
// }
// else if (aRow < bRow) startA++;
// else startB++;
// }
//}
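The commented-out kernel above is the previous implementation: it wrote each matched product into a dense output matrix c, which the host code then converted back to sparse CSC. The three kernels that follow replace it with an in-place pipeline on a copy of A: _Mark computes the products and flags unmatched entries, _Scan turns the per-column survivor counts into CSC column offsets, and _Update compacts the flagged arrays into the final sparse result.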


template <class ElemType>
__global__ void _sparseCSCElemMulsparseCSC_Mark(
const int m,
const CUDA_LONG n,
ElemType* a_dVal,
int* a_dRow,
int* a_dCol,
const ElemType* b_dVal,
const int* b_dRow,
const int* b_dCol
)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= n)
return;
int startA = a_dCol[id];
int endA = a_dCol[id + 1];

int startB = b_dCol[id];
int endB = b_dCol[id + 1];
int NZCounter = endA - startA;

while (startA < endA)
{
int aRow = a_dRow[startA];
if (startB >= endB)
{
a_dVal[startA] = (ElemType)0;
a_dRow[startA] = -1;
startA++;
NZCounter--;
continue;
}
int bRow = b_dRow[startB];
if (aRow == bRow)
{
a_dVal[startA] *= b_dVal[startB];
startA++;
}
else if (aRow > bRow)
startB++;
else
{
a_dVal[startA] = (ElemType)0;
a_dRow[startA] = -1;
startA++;
NZCounter--;
}
}
assert(NZCounter >= 0);
a_dCol[id+1] = NZCounter;
}
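_Mark walks the two sorted row-index lists of each column in lockstep, as in a sorted-merge intersection. Matched entries of the A copy are overwritten with the product; entries of A with no partner in B are zeroed and their row index set to -1; and the surviving-nonzero count for the column is left in a_dCol[id + 1] for the scan pass to turn into an offset. For reference, a sequential C++ sketch of the same per-column logic (a hypothetical CPU-side checking helper; names are illustrative):

// Illustrative CPU equivalent of _sparseCSCElemMulsparseCSC_Mark for one column.
void MarkColumnCPU(float* aVal, int* aRow, int* aCol,
                   const float* bVal, const int* bRow, const int* bCol, int col)
{
    int startA = aCol[col], endA = aCol[col + 1];
    int startB = bCol[col], endB = bCol[col + 1];
    int nzCount = endA - startA;
    while (startA < endA)
    {
        if (startB >= endB || aRow[startA] < bRow[startB])
        {
            aVal[startA] = 0; aRow[startA] = -1; // no match in B: mark dead
            nzCount--;
            startA++;
        }
        else if (aRow[startA] == bRow[startB])
        {
            aVal[startA] *= bVal[startB]; // rows match: element-wise product
            startA++;
        }
        else
        {
            startB++; // advance B past rows A does not have
        }
    }
    aCol[col + 1] = nzCount; // a count for now; the scan pass makes it an offset
}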

template <int BlockSize, class ElemType>
__global__ void _sparseCSCElemMulsparseCSC_Scan(const int segIdx, const CUDA_LONG n, GPUSPARSE_INDEX_TYPE* c, GPUSPARSE_INDEX_TYPE* aggregate)
{
typedef cub::BlockScan<int, BlockSize> BlockScanT;
__shared__ typename BlockScanT::TempStorage tmp;
int cur = 0;
if (BlockSize * segIdx + threadIdx.x < n)
{
cur = c[BlockSize*segIdx + threadIdx.x + 1];
}
int block_aggregate = 0;
BlockScanT(tmp).InclusiveSum(cur, cur, block_aggregate);
if (BlockSize * segIdx + threadIdx.x < n)
{
c[BlockSize*segIdx + threadIdx.x + 1] = cur + *aggregate;
}
if (threadIdx.x == 0) *aggregate += block_aggregate;
}
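_Scan uses cub's BlockScan collective to turn those per-column counts into cumulative CSC column offsets. Each launch handles one BlockSize-wide segment of the array and carries the running total across segments through the one-element aggregate buffer; launching the segments one block at a time from the host loop (see GPUSparseMatrix.cu below) serializes them on the stream, which is what keeps the read-modify-write of *aggregate coherent across launches. Within a launch, note that every thread reads *aggregate while only thread 0 later increments it; a __syncthreads() before the final update would make that intended ordering explicit. A minimal, self-contained sketch of the BlockScan primitive itself (an illustrative kernel, not part of this commit):

#include <cub/cub.cuh>

// Illustrative only: 128 threads each contribute 1. After InclusiveSum,
// thread i holds i + 1 and blockTotal receives the block-wide sum (128).
__global__ void BlockScanDemo(int* out)
{
    typedef cub::BlockScan<int, 128> BlockScanT;
    __shared__ typename BlockScanT::TempStorage tmp;
    int v = 1;
    int blockTotal = 0;
    BlockScanT(tmp).InclusiveSum(v, v, blockTotal);
    out[threadIdx.x] = v; // out[i] == i + 1
}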

template <class ElemType>
__global__ void _sparseCSCElemMulsparseCSC_Update(
const CUDA_LONG n,
ElemType* in_dVal,
int* in_dRow,
int* in_dCol,
int* in_dCol_res,
ElemType* out_dVal,
int* out_dRow,
int* out_dCol
)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= n)
return;
out_dCol[id] = in_dCol[id];
if (id == n - 1) out_dCol[n] = in_dCol[n];

int in_start = in_dCol_res[id];
int in_end = in_dCol_res[id + 1];

int out_start = out_dCol[id];
int out_end = out_dCol[id + 1];

while (in_start < in_end)
{
int rowIdx = in_dRow[in_start];
if (rowIdx >= 0)
{
assert(out_start < out_end);
out_dRow[out_start] = in_dRow[in_start];
out_dVal[out_start] = in_dVal[in_start];
out_start++;
}
in_start++;
}
}
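_Update is a per-column stream compaction. in_dCol_res carries A's original (pre-scan) column offsets, so each thread rescans its column's original slots and copies the entries that survived _Mark (row index >= 0) into the positions given by the scanned offsets in in_dCol. A sequential sketch of the same logic (illustrative CPU-side names):

// Illustrative CPU equivalent of _sparseCSCElemMulsparseCSC_Update for one column:
// copy surviving (row >= 0) entries from the marked arrays into the compacted
// output at the offsets produced by the scan pass.
void UpdateColumnCPU(const float* inVal, const int* inRow,
                     const int* inCol,    // scanned offsets (output layout)
                     const int* inColRes, // original offsets of A (input layout)
                     float* outVal, int* outRow, int* outCol, int col, int n)
{
    outCol[col] = inCol[col];
    if (col == n - 1) outCol[n] = inCol[n];
    int outPos = inCol[col];
    for (int i = inColRes[col]; i < inColRes[col + 1]; i++)
    {
        if (inRow[i] >= 0) // entry survived the mark pass
        {
            outRow[outPos] = inRow[i];
            outVal[outPos] = inVal[i];
            outPos++;
        }
    }
}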

87 changes: 68 additions & 19 deletions Source/Math/GPUSparseMatrix.cu
@@ -18,7 +18,7 @@
#include "CommonMatrix.h"
#include <iostream> // for cout/cerr
#include <assert.h>

#include <cub/cub.cuh>
typedef unsigned char byte;

#pragma warning(disable : 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
@@ -2006,37 +2006,86 @@ GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<
return GPUSparseMatrix<ElemType>::ElementProductOf(b, a);
}

// use cub lib
// sparse x sparse = sparse
template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
if (a.GetFormat() == matrixFormatSparseCSR || b.GetFormat() == matrixFormatSparseCSR)
NOT_IMPLEMENTED;

if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
LogicError("ElementProductOf: matrix dimensions mismatch");

b.PrepareDevice();
GPUSparseMatrix<ElemType> aCopy(a);

int m = (int)a.GetNumRows();
CUDA_LONG n = (CUDA_LONG)a.GetNumCols();
int blocksPerGrid = (int)ceil(1.0 * n / GridDim::maxThreadsPerBlock);
SyncGuard syncGuard;
_sparseCSCElemMulsparseCSC_Mark<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock >> >(m, n, aCopy.Data(), aCopy.RowLocation(), aCopy.ColLocation(), b.Data(), b.RowLocation(), b.ColLocation());
//int* tmp = new int[100];
//cudaMemcpy(tmp, aCopy.ColLocation(), sizeof(int) * (aCopy.GetNumCols()+1), cudaMemcpyDeviceToHost);
//for (int i = 0; i <= n; i++)
// cout << tmp[i] << " " << endl;
// reduce the segmented column index to obtain the correct column index
GPUSPARSE_INDEX_TYPE* dev_agg = 0;
CUDA_CALL(cudaMalloc((void**)&dev_agg, sizeof(GPUSPARSE_INDEX_TYPE)));
CUDA_CALL(cudaMemset((void *)dev_agg, 0, sizeof(GPUSPARSE_INDEX_TYPE)));
for (int i = 0; i < blocksPerGrid; i++)
{
_sparseCSCElemMulsparseCSC_Scan<GridDim::maxThreadsPerBlock, ElemType> << <1, GridDim::maxThreadsPerBlock >> >(i, n, aCopy.ColLocation(), dev_agg);
}
//cudaMemcpy(tmp, aCopy.ColLocation(), sizeof(int) * (n + 1), cudaMemcpyDeviceToHost);
//for (int i = 0; i <= n; i++)
// cout << tmp[i] << " " << endl;
int newNumNZ = 0;
CUDA_CALL(cudaMemcpy(&newNumNZ, dev_agg, sizeof(int), cudaMemcpyDeviceToHost));
GPUSparseMatrix<ElemType> c(a.GetNumRows(), a.GetNumCols(), newNumNZ, a.GetComputeDeviceId(), MatrixFormat::matrixFormatSparseCSC);
_sparseCSCElemMulsparseCSC_Update<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(n, aCopy.Data(), aCopy.RowLocation(), aCopy.ColLocation(), a.ColLocation(), c.Data(), c.RowLocation(), c.ColLocation());
CUDA_CALL(cudaFree(dev_agg));
return c;
}
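To make the three phases concrete, here is a hand-worked trace on small assumed inputs (3x3 CSC matrices; the values are illustrative, not taken from the commit):

// A: colPtr = [0,2,3,5], rowIdx = [0,2,1,0,1], vals = [1,2,3,4,5]
// B: colPtr = [0,1,2,3], rowIdx = [2,0,1],     vals = [10,20,30]
//
// After _Mark (in place on aCopy; colPtr[j+1] now holds per-column survivor counts):
//   colPtr = [0,1,0,1], rowIdx = [-1,2,-1,-1,1], vals = [0,20,0,0,150]
// After the _Scan loop (inclusive prefix sum over the counts; *dev_agg == 2):
//   colPtr = [0,1,1,2]  ->  newNumNZ = 2
// After _Update (compacting with A's original colPtr as in_dCol_res):
//   c: colPtr = [0,1,1,2], rowIdx = [2,1], vals = [20,150]
// which is exactly the nonzero pattern of the element-wise product of A and B.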
//// sparse x sparse = sparse
//template <class ElemType>
//GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
//{
// if (a.GetFormat() == matrixFormatSparseCSR || b.GetFormat() == matrixFormatSparseCSR)
// NOT_IMPLEMENTED;
//
// if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
// LogicError("ElementProductOf: matrix dimensions mismatch");
//
// b.PrepareDevice();
// GPUMatrix<ElemType> c(b.GetNumRows(), b.GetNumCols(), b.GetComputeDeviceId());
//
// int m = (int)a.GetNumRows();
// CUDA_LONG n = (CUDA_LONG)a.GetNumCols();
// int blocksPerGrid = (int)ceil(1.0 * n / GridDim::maxThreadsPerBlock);
// SyncGuard syncGuard;
// _sparseCSCElemMulsparseCSC<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(m, n, a.Data(), a.RowLocation(), a.ColLocation(), b.Data(), b.RowLocation(), b.ColLocation(), c.Data());
//
// // convert dense to sparse CSC
// GPUSparseMatrix<ElemType> cs(c, MatrixFormat::matrixFormatSparseCSC);
//
// ////GPUMatrix<ElemType> ad(a.GetComputeDeviceId());
// ////a.CopyToDenseMatrix(ad);
// //ElemType *carr = c.CopyToArray();
//
// //for (int i = 0; i < m*n; i++) if (carr[i] > 0.0) fprintf(stderr, "%10.4f ", carr[i]);
//
// return cs;
//}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
Expand Down
