Commit f6a2cc3

optimize the sparse element wise times kernels

jaurora committed Jun 10, 2016
1 parent e196db0 commit f6a2cc3

Showing 3 changed files with 202 additions and 41 deletions.
1 change: 1 addition & 0 deletions Source/Math/GPUMatrix.h
@@ -630,6 +630,7 @@ class SyncGuard
{
static bool DoSync()
{
//#undef NO_SYNC
#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
static bool do_sync = false;
#else
155 changes: 133 additions & 22 deletions Source/Math/GPUMatrixCUDAKernels.cuh
@@ -2743,39 +2743,150 @@ __global__ void _sparseCSRElemMulDense(
}
}

//template <class ElemType>
//__global__ void _sparseCSCElemMulsparseCSC(
// const int m,
// const CUDA_LONG n,
// const ElemType* a_dVal,
// const int* a_dRow,
// const int* a_dCol,
// const ElemType* b_dVal,
// const int* b_dRow,
// const int* b_dCol,
// ElemType* c)
//{
// CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
// if (id >= n)
// return;
// int startA = a_dCol[id];
// int endA = a_dCol[id + 1];
//
// int startB = b_dCol[id];
// int endB = b_dCol[id + 1];
//
// while (startA < endA && startB < endB)
// {
// int aRow = a_dRow[startA];
// int bRow = b_dRow[startB];
//
// if (aRow == bRow)
// {
// c[IDX2C(aRow, id, m)] = a_dVal[startA] * b_dVal[startB];
// startA++;
// }
// else if (aRow < bRow) startA++;
// else startB++;
// }
//}
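The commented-out kernel above is the previous implementation: it wrote each matched product into a dense output matrix c, which the host code then converted back to sparse CSC. The three kernels that follow replace it with an in-place pipeline on a copy of A: _Mark computes the products and flags unmatched entries, _Scan turns the per-column survivor counts into CSC column offsets, and _Update compacts the flagged arrays into the final sparse result.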


template <class ElemType>
__global__ void _sparseCSCElemMulsparseCSC_Mark(
const int m,
const CUDA_LONG n,
ElemType* a_dVal,
int* a_dRow,
int* a_dCol,
const ElemType* b_dVal,
const int* b_dRow,
const int* b_dCol
)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= n)
return;
int startA = a_dCol[id];
int endA = a_dCol[id + 1];

int startB = b_dCol[id];
int endB = b_dCol[id + 1];
int NZCounter = endA - startA;

while (startA < endA)
{
int aRow = a_dRow[startA];
if (startB >= endB)
{
a_dVal[startA] = (ElemType)0;
a_dRow[startA] = -1;
startA++;
NZCounter--;
continue;
}
int bRow = b_dRow[startB];
if (aRow == bRow)
{
a_dVal[startA] *= b_dVal[startB];
startA++;
}
else if (aRow > bRow)
startB++;
else
{
a_dVal[startA] = (ElemType)0;
a_dRow[startA] = -1;
startA++;
NZCounter--;
}
}
assert(NZCounter >= 0);
a_dCol[id+1] = NZCounter;
}
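_Mark walks the two sorted row-index lists of each column in lockstep, as in a sorted-merge intersection. Matched entries of the A copy are overwritten with the product; entries of A with no partner in B are zeroed and their row index set to -1; and the surviving-nonzero count for the column is left in a_dCol[id + 1] for the scan pass to turn into an offset. For reference, a sequential C++ sketch of the same per-column logic (a hypothetical CPU-side checking helper; names are illustrative):

// Illustrative CPU equivalent of _sparseCSCElemMulsparseCSC_Mark for one column.
void MarkColumnCPU(float* aVal, int* aRow, int* aCol,
                   const float* bVal, const int* bRow, const int* bCol, int col)
{
    int startA = aCol[col], endA = aCol[col + 1];
    int startB = bCol[col], endB = bCol[col + 1];
    int nzCount = endA - startA;
    while (startA < endA)
    {
        if (startB >= endB || aRow[startA] < bRow[startB])
        {
            aVal[startA] = 0; aRow[startA] = -1; // no match in B: mark dead
            nzCount--;
            startA++;
        }
        else if (aRow[startA] == bRow[startB])
        {
            aVal[startA] *= bVal[startB]; // rows match: element-wise product
            startA++;
        }
        else
        {
            startB++; // advance B past rows A does not have
        }
    }
    aCol[col + 1] = nzCount; // a count for now; the scan pass makes it an offset
}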

template <int BlockSize, class ElemType>
__global__ void _sparseCSCElemMulsparseCSC_Scan(const int segIdx, const CUDA_LONG n, GPUSPARSE_INDEX_TYPE* c, GPUSPARSE_INDEX_TYPE* aggregate)
{
typedef cub::BlockScan<int, BlockSize> BlockScanT;
__shared__ typename BlockScanT::TempStorage tmp;
int cur = 0;
if (BlockSize * segIdx + threadIdx.x < n)
{
cur = c[BlockSize*segIdx + threadIdx.x + 1];
}
int block_aggregate = 0;
BlockScanT(tmp).InclusiveSum(cur, cur, block_aggregate);
if (BlockSize * segIdx + threadIdx.x < n)
{
c[BlockSize*segIdx + threadIdx.x + 1] = cur + *aggregate;
}
if (threadIdx.x == 0) *aggregate += block_aggregate;
}
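_Scan uses cub's BlockScan collective to turn those per-column counts into cumulative CSC column offsets. Each launch handles one BlockSize-wide segment of the array and carries the running total across segments through the one-element aggregate buffer; launching the segments one block at a time from the host loop (see GPUSparseMatrix.cu below) serializes them on the stream, which is what keeps the read-modify-write of *aggregate coherent across launches. Within a launch, note that every thread reads *aggregate while only thread 0 later increments it; a __syncthreads() before the final update would make that intended ordering explicit. A minimal, self-contained sketch of the BlockScan primitive itself (an illustrative kernel, not part of this commit):

#include <cub/cub.cuh>

// Illustrative only: 128 threads each contribute 1. After InclusiveSum,
// thread i holds i + 1 and blockTotal receives the block-wide sum (128).
__global__ void BlockScanDemo(int* out)
{
    typedef cub::BlockScan<int, 128> BlockScanT;
    __shared__ typename BlockScanT::TempStorage tmp;
    int v = 1;
    int blockTotal = 0;
    BlockScanT(tmp).InclusiveSum(v, v, blockTotal);
    out[threadIdx.x] = v; // out[i] == i + 1
}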

template <class ElemType>
__global__ void _sparseCSCElemMulsparseCSC_Update(
const CUDA_LONG n,
ElemType* in_dVal,
int* in_dRow,
int* in_dCol,
int* in_dCol_res,
ElemType* out_dVal,
int* out_dRow,
int* out_dCol
)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= n)
return;
out_dCol[id] = in_dCol[id];
if (id == n - 1) out_dCol[n] = in_dCol[n];

int in_start = in_dCol_res[id];
int in_end = in_dCol_res[id + 1];

int out_start = out_dCol[id];
int out_end = out_dCol[id + 1];

while (in_start < in_end)
{
int rowIdx = in_dRow[in_start];
if (rowIdx >= 0)
{
assert(out_start < out_end);
out_dRow[out_start] = in_dRow[in_start];
out_dVal[out_start] = in_dVal[in_start];
out_start++;
}
in_start++;
}
}
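_Update is a per-column stream compaction. in_dCol_res carries A's original (pre-scan) column offsets, so each thread rescans its column's original slots and copies the entries that survived _Mark (row index >= 0) into the positions given by the scanned offsets in in_dCol. A sequential sketch of the same logic (illustrative CPU-side names):

// Illustrative CPU equivalent of _sparseCSCElemMulsparseCSC_Update for one column:
// copy surviving (row >= 0) entries from the marked arrays into the compacted
// output at the offsets produced by the scan pass.
void UpdateColumnCPU(const float* inVal, const int* inRow,
                     const int* inCol,    // scanned offsets (output layout)
                     const int* inColRes, // original offsets of A (input layout)
                     float* outVal, int* outRow, int* outCol, int col, int n)
{
    outCol[col] = inCol[col];
    if (col == n - 1) outCol[n] = inCol[n];
    int outPos = inCol[col];
    for (int i = inColRes[col]; i < inColRes[col + 1]; i++)
    {
        if (inRow[i] >= 0) // entry survived the mark pass
        {
            outRow[outPos] = inRow[i];
            outVal[outPos] = inVal[i];
            outPos++;
        }
    }
}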

87 changes: 68 additions & 19 deletions Source/Math/GPUSparseMatrix.cu
@@ -18,7 +18,7 @@
#include "CommonMatrix.h"
#include <iostream> // for cout/cerr
#include <assert.h>

#include <cub/cub.cuh>
typedef unsigned char byte;

#pragma warning(disable : 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
@@ -2006,37 +2006,86 @@ GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<
return GPUSparseMatrix<ElemType>::ElementProductOf(b, a);
}

// use cub lib
// sparse x sparse = sparse
template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
if (a.GetFormat() == matrixFormatSparseCSR || b.GetFormat() == matrixFormatSparseCSR)
NOT_IMPLEMENTED;

if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
LogicError("ElementProductOf: matrix dimensions mismatch");

b.PrepareDevice();
GPUSparseMatrix<ElemType> aCopy(a);

int m = (int)a.GetNumRows();
CUDA_LONG n = (CUDA_LONG)a.GetNumCols();
int blocksPerGrid = (int)ceil(1.0 * n / GridDim::maxThreadsPerBlock);
SyncGuard syncGuard;
_sparseCSCElemMulsparseCSC_Mark<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock >> >(m, n, aCopy.Data(), aCopy.RowLocation(), aCopy.ColLocation(), b.Data(), b.RowLocation(), b.ColLocation());
//int* tmp = new int[100];
//cudaMemcpy(tmp, aCopy.ColLocation(), sizeof(int) * (aCopy.GetNumCols()+1), cudaMemcpyDeviceToHost);
//for (int i = 0; i <= n; i++)
// cout << tmp[i] << " " << endl;
// reduce the segmented column index to obtain the correct column index
GPUSPARSE_INDEX_TYPE* dev_agg = 0;
CUDA_CALL(cudaMalloc((void**)&dev_agg, sizeof(GPUSPARSE_INDEX_TYPE)));
CUDA_CALL(cudaMemset((void *)dev_agg, 0, sizeof(GPUSPARSE_INDEX_TYPE)));
for (int i = 0; i < blocksPerGrid; i++)
{
_sparseCSCElemMulsparseCSC_Scan<GridDim::maxThreadsPerBlock, ElemType> << <1, GridDim::maxThreadsPerBlock >> >(i, n, aCopy.ColLocation(), dev_agg);
}
//cudaMemcpy(tmp, aCopy.ColLocation(), sizeof(int) * (n + 1), cudaMemcpyDeviceToHost);
//for (int i = 0; i <= n; i++)
// cout << tmp[i] << " " << endl;
int newNumNZ = 0;
CUDA_CALL(cudaMemcpy(&newNumNZ, dev_agg, sizeof(int), cudaMemcpyDeviceToHost));
GPUSparseMatrix<ElemType> c(a.GetNumRows(), a.GetNumCols(), newNumNZ, a.GetComputeDeviceId(), MatrixFormat::matrixFormatSparseCSC);
_sparseCSCElemMulsparseCSC_Update<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(n, aCopy.Data(), aCopy.RowLocation(), aCopy.ColLocation(), a.ColLocation(), c.Data(), c.RowLocation(), c.ColLocation());
CUDA_CALL(cudaFree(dev_agg));
return c;
}
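To make the three phases concrete, here is a hand-worked trace on small assumed inputs (3x3 CSC matrices; the values are illustrative, not taken from the commit):

// A: colPtr = [0,2,3,5], rowIdx = [0,2,1,0,1], vals = [1,2,3,4,5]
// B: colPtr = [0,1,2,3], rowIdx = [2,0,1],     vals = [10,20,30]
//
// After _Mark (in place on aCopy; colPtr[j+1] now holds per-column survivor counts):
//   colPtr = [0,1,0,1], rowIdx = [-1,2,-1,-1,1], vals = [0,20,0,0,150]
// After the _Scan loop (inclusive prefix sum over the counts; *dev_agg == 2):
//   colPtr = [0,1,1,2]  ->  newNumNZ = 2
// After _Update (compacting with A's original colPtr as in_dCol_res):
//   c: colPtr = [0,1,1,2], rowIdx = [2,1], vals = [20,150]
// which is exactly the nonzero pattern of the element-wise product of A and B.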
//// sparse x sparse = sparse
//template <class ElemType>
//GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
//{
// if (a.GetFormat() == matrixFormatSparseCSR || b.GetFormat() == matrixFormatSparseCSR)
// NOT_IMPLEMENTED;
//
// if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
// LogicError("ElementProductOf: matrix dimensions mismatch");
//
// b.PrepareDevice();
// GPUMatrix<ElemType> c(b.GetNumRows(), b.GetNumCols(), b.GetComputeDeviceId());
//
// int m = (int)a.GetNumRows();
// CUDA_LONG n = (CUDA_LONG)a.GetNumCols();
// int blocksPerGrid = (int)ceil(1.0 * n / GridDim::maxThreadsPerBlock);
// SyncGuard syncGuard;
// _sparseCSCElemMulsparseCSC<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(m, n, a.Data(), a.RowLocation(), a.ColLocation(), b.Data(), b.RowLocation(), b.ColLocation(), c.Data());
//
// // convert dense to sparse CSC
// GPUSparseMatrix<ElemType> cs(c, MatrixFormat::matrixFormatSparseCSC);
//
// ////GPUMatrix<ElemType> ad(a.GetComputeDeviceId());
// ////a.CopyToDenseMatrix(ad);
// //ElemType *carr = c.CopyToArray();
//
// //for (int i = 0; i < m*n; i++) if (carr[i] > 0.0) fprintf(stderr, "%10.4f ", carr[i]);
//
// return cs;
//}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
Expand Down
