Commit 8ae5176

Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/separate1bitDataParallelSGD

amitaga committed Jan 15, 2016
2 parents a8029ea + 9241f30
Showing 9 changed files with 167 additions and 118 deletions.
README.md (4 changes: 2 additions & 2 deletions)

@@ -70,7 +70,7 @@ Linux users should clone from this URL: https://git.codeplex.com/cntk

 git clone https://git.codeplex.com/cntk
 
-More detail you can follow this thread:
+For more details you can follow this thread:
 http://codeplex.codeplex.com/workitem/26133


@@ -138,7 +138,7 @@ Make sure the following CUDA environment variables are set to the correct path
 CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 CUDA_PATH_V7_0=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 
-Open the CNTKSolution and build the CNTK project.
+Open the CNTK solution and build the CNTK project.
 
 Note: If you make modifications to the code, please first disable the
 insertion of TAB characters. If you use Visual Studio as your editor,
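Aside: a quick way to confirm these variables are actually visible to the build is a short check program. This is a hypothetical helper, not part of CNTK or of this diff.

#include <cstdio>
#include <cstdlib>

int main() {
    // Print the CUDA variables the README asks for; "(not set)" flags a problem.
    for (const char* name : {"CUDA_PATH", "CUDA_PATH_V7_0"}) {
        const char* value = std::getenv(name);
        std::printf("%s = %s\n", name, value ? value : "(not set)");
    }
    return 0;
}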
Source/Math/ConvolutionEngine.cpp (37 changes: 18 additions & 19 deletions)

@@ -49,16 +49,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c()); UNUSED(packedInputRows);
 
     // GPU and 1-dimensional image
-    bool gpuSparse1D = (inT.h() == 1 &&
+    m_gpuSparseOpt = (filterT.h() == 1 &&
                         in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
                         convDesc.wStride() == 1 &&
                         !convDesc.padding() &&
                         in.GetMatrixType() == MatrixType::SPARSE);
+    m_gpuSparse1D = (m_gpuSparseOpt && inT.h() == 1);
 
     out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
 
     // Reshaping is only necessary if we are going to use the unpacking trick
-    if (!gpuSparse1D)
+    if (m_gpuSparseOpt)
+        out.Reshape(outT.c() * outT.w(), outT.h() * batchSize);
+    else
         out.Reshape(outT.c(), outputSizePerChannel * batchSize);
 
     size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
@@ -75,17 +78,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             // [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
             // [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
             // [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
-            if (in.GetMatrixType() == MatrixType::DENSE)
+            if (in.GetMatrixType() == MatrixType::DENSE || m_gpuSparse1D)
                 inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
             else
                 inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
 
-            if (gpuSparse1D)
+            if (m_gpuSparseOpt)
             {
                 if (filterT.w() * inT.c() != filter.GetNumCols())
                     LogicError("Kernel width and weight matrix dimensions don't match.");
 
-                Mat outputSubBatch = out.ColumnSlice(startSampleId, smallBatchSize);
+                inputSubBatch.Reshape(inT.c() * inT.w(), inT.h() * smallBatchSize);
+                Mat outputSubBatch = out.ColumnSlice(startSampleId, outT.h() * smallBatchSize);
                 Mat::ConvolveAndWeightedAdd(1, filter, false, inputSubBatch, false, 0, outputSubBatch,
                                             static_cast<int>(inT.c()), convDesc.wStride(), convDesc.padding(), true);
             }
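To see at a glance what this hunk changes: the old single gpuSparse1D test is split into two flags. A minimal standalone sketch of how they are derived and what they gate (hypothetical simplified types, not CNTK code): m_gpuSparseOpt selects the dedicated sparse kernel whenever the filter is one row high, and m_gpuSparse1D additionally requires a 1-D input, which then takes the cheap ColumnSlice path instead of the costly sparse-to-dense SetValue conversion.

#include <cstdio>

enum class MatrixType { DENSE, SPARSE };
enum class Location { CPU, GPU };

struct ConvOptions {
    MatrixType inputType;
    Location location;
    int wStride;
    bool padding;
    int filterH; // kernel height (filterT.h())
    int inputH;  // input image height (inT.h())
};

struct SparseOptFlags {
    bool gpuSparseOpt; // take the dedicated sparse convolution kernel
    bool gpuSparse1D;  // input is also 1-D: slice instead of converting to dense
};

SparseOptFlags computeFlags(const ConvOptions& o) {
    SparseOptFlags f{};
    f.gpuSparseOpt = o.filterH == 1 &&
                     o.location == Location::GPU &&
                     o.wStride == 1 &&
                     !o.padding &&
                     o.inputType == MatrixType::SPARSE;
    f.gpuSparse1D = f.gpuSparseOpt && o.inputH == 1;
    return f;
}

int main() {
    // A text-style workload: sparse GPU input, 1-row kernel, stride 1, no padding.
    ConvOptions o{MatrixType::SPARSE, Location::GPU, 1, false, 1, 1};
    SparseOptFlags f = computeFlags(o);
    std::printf("gpuSparseOpt=%d gpuSparse1D=%d\n", f.gpuSparseOpt, f.gpuSparse1D);
    return 0;
}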
@@ -195,14 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
     size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
 
-    // GPU and 1-dimensional image
-    bool gpuSparse1D = (inT.h() == 1 &&
-                        in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
-                        convDesc.wStride() == 1 &&
-                        !convDesc.padding() &&
-                        in.GetMatrixType() == MatrixType::SPARSE);
-
-    if (numSubBatches == 1 && allowReuse && !gpuSparse1D) //reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
+    if (numSubBatches == 1 && allowReuse && !m_gpuSparseOpt) //reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
         // REVIEW alexeyk: the following makes an assumption that data in workspace was filled by Forward call and remained unchanged. Find way to enforce/verify that.
         Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, filter);
     else
@@ -218,19 +215,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 // [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
                 // [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
                 // [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
-                if (gpuSparse1D)
+                if (m_gpuSparseOpt)
                 {
                     Matrix<ElemType> inputSubBatch;
                     inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
-                    inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w());
+                    inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w() * inT.h());
                     Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
-                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize, inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
+                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize * inT.h(), inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
 
-                    Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
-                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize, srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
+                    Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.h() * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
+                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize * srcGradT.h(), srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
 
                     filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
-                    Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize, convDesc.wStride(), convDesc.padding(), false);
+                    Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize * inT.h(), convDesc.wStride(), convDesc.padding(), false);
                     filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
                 }
                 else
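The backward path now shuffles over smallBatchSize * inT.h() instead of smallBatchSize, extending the sparse path from strictly 1-D inputs to inputs with h > 1 (the filter must still be one row high). A CPU reference of the axis swap that TensorShuffleScaleAndAdd performs may help; this is a sketch under the (D x S x M x K x T) layout stated in the kernel comments, not CNTK code.

#include <cstddef>
#include <vector>

// Per column t, rewrite a (D x S x M x K) block as (D x K x M x S): the S and K
// axes are swapped while D stays fastest-varying.
std::vector<float> shuffleDSMKT(const std::vector<float>& a,
                                std::size_t D, std::size_t S, std::size_t M,
                                std::size_t K, std::size_t T) {
    std::vector<float> c(a.size());
    for (std::size_t t = 0; t < T; t++)
        for (std::size_t k = 0; k < K; k++)
            for (std::size_t m = 0; m < M; m++)
                for (std::size_t s = 0; s < S; s++)
                    for (std::size_t d = 0; d < D; d++) {
                        std::size_t na = (((t * K + k) * M + m) * S + s) * D + d; // source: d, s, m, k order
                        std::size_t nc = (((t * S + s) * M + m) * K + k) * D + d; // destination: s and k swapped
                        c[nc] = a[na];
                    }
    return c;
}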
@@ -314,6 +311,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 private:
     size_t m_maxTempMemSizeInSamples;
     Mat m_ones;
+    bool m_gpuSparseOpt;
+    bool m_gpuSparse1D;
 };
 
 template class ConvolutionEngine<float>;
Source/Math/GPUMatrixCUDAKernels.cuh (73 changes: 50 additions & 23 deletions)

@@ -1285,7 +1285,7 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
     size_t nz)
 {
     CUDA_LONG N = blockDim.x * blockIdx.x + threadIdx.x; // input tensor of dimension (D x S x M x K x T)
-    if (N >= nz || N < aColCSCIndex[0])
+    if (N < aColCSCIndex[0] || N >= aColCSCIndex[T])
         return;
 
     size_t col;
@@ -1309,9 +1309,10 @@
         size_t nc = ((s * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S): k/K and s/S swapped
 
         int rowIdx = start;
-        for (size_t na_i = start; na_i < end; na_i++)
+        for (size_t j = start; j < end; j++)
         {
             // recover the 5 indices from the loop counter
+            size_t na_i = aRowIndex[j];
             size_t d_i = (na_i) % D;
             size_t s_i = (na_i / D) % S;
             size_t m_i = (na_i / D / S) % M;
@@ -1328,7 +1329,7 @@
     cnzValues[rowIdx] = anzValues[N];
     cRowIndex[rowIdx] = nc;
 
-    if (N == nz - 1)
+    if (N == 0)
     {
         for (int i = 0; i <= T; i++)
        {
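The loop fix above matters because in CSC storage the position j of a nonzero inside a column is not its row: the row must be read from the row-index array, which is exactly what the added aRowIndex[j] lookup does. A minimal CSC traversal sketch (hypothetical example data, not CNTK code):

#include <cstdio>

int main() {
    // 3x3 matrix [[1,0,2],[0,0,0],[0,3,4]] in CSC form.
    float values[]   = {1.0f, 3.0f, 2.0f, 4.0f};
    int   rowIndex[] = {0, 2, 0, 2}; // row of each stored value
    int   colCSC[]   = {0, 1, 2, 4}; // column c spans [colCSC[c], colCSC[c+1])
    for (int c = 0; c < 3; c++)
        for (int j = colCSC[c]; j < colCSC[c + 1]; j++)
            std::printf("a[%d][%d] = %g\n", rowIndex[j], c, values[j]);
    return 0;
}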
@@ -2788,35 +2789,57 @@ __global__ void _isValid(
     )
 {
     CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= cols)
+    if (id >= cols || d_res[0] <= 0)
         return;
 
     int start = colCSCIndex[id];
     int end = colCSCIndex[id + 1];
-    d_res[0] = 1;
 
     if (start > end)
     {
-        d_res[0] = -1;
-        d_res[1] = start;
-        d_res[2] = end;
+        if (d_res[0] > 0)
+        {
+            d_res[0] = -1;
+            d_res[1] = id;
+            d_res[2] = start;
+            d_res[3] = end;
+        }
     }
     else if (end > nz)
     {
-        d_res[0] = -2;
-        d_res[1] = end;
-        d_res[2] = nz;
+        if (d_res[0] > 0)
+        {
+            d_res[0] = -2;
+            d_res[1] = id + 1;
+            d_res[2] = end;
+            d_res[3] = nz;
+        }
     }
     else
     {
         for (int j = start; j < end; j++) //j points to the value
         {
-            if (rowIndex[j] > rows)
+            if (rowIndex[j] >= rows)
             {
-                d_res[0] = -3;
-                d_res[1] = rowIndex[j];
-                d_res[2] = rows;
-                break;
+                if (d_res[0] > 0)
+                {
+                    d_res[0] = -3;
+                    d_res[1] = j;
+                    d_res[2] = rowIndex[j];
+                    d_res[3] = rows;
+                    break;
+                }
             }
+            if (j > start && rowIndex[j] < rowIndex[j - 1])
+            {
+                if (d_res[0] > 0)
+                {
+                    d_res[0] = -4;
+                    d_res[1] = id;
+                    d_res[2] = j;
+                    d_res[3] = rowIndex[j];
+                    break;
+                }
+            }
         }
     }
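The reworked _isValid adds a fourth failure mode (-4: row indices within a column not in ascending order), fixes the bounds test to rowIndex[j] >= rows, and guards each write with d_res[0] > 0 so a later thread is unlikely to clobber the first recorded diagnostics; d_res[0] is presumably now pre-initialized to 1 by the caller, since the in-kernel initialization was removed. A single-threaded reference of the same checks and result codes (a sketch, not CNTK code):

// res[0] > 0 means valid; negative values mirror the kernel's error codes,
// with res[1..3] carrying the same diagnostics the kernel records.
void isValidCSC(const int* colCSC, const int* rowIndex,
                int rows, int cols, int nz, int res[4]) {
    res[0] = 1; // caller-side initialization the kernel now relies on
    for (int id = 0; id < cols && res[0] > 0; id++) {
        int start = colCSC[id];
        int end = colCSC[id + 1];
        if (start > end) {            // column range runs backwards
            res[0] = -1; res[1] = id; res[2] = start; res[3] = end;
        } else if (end > nz) {        // column range exceeds the nonzero count
            res[0] = -2; res[1] = id + 1; res[2] = end; res[3] = nz;
        } else {
            for (int j = start; j < end; j++) {
                if (rowIndex[j] >= rows) {                        // row out of bounds
                    res[0] = -3; res[1] = j; res[2] = rowIndex[j]; res[3] = rows;
                    break;
                }
                if (j > start && rowIndex[j] < rowIndex[j - 1]) { // rows not sorted
                    res[0] = -4; res[1] = id; res[2] = j; res[3] = rowIndex[j];
                    break;
                }
            }
        }
    }
}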
@@ -2825,14 +2848,18 @@
 template<class ElemType>
 __global__ void _shiftColCSCIndexFromSliceViewToAbsolute(
     GPUSPARSE_INDEX_TYPE* colCSCIndex,
-    const int cols
+    const int cols,
+    const int nz
     )
 {
     CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
     if (id >= cols)
         return;
 
     colCSCIndex[id] = colCSCIndex[id] - colCSCIndex[0];
+
+    if (id == cols - 1)
+        colCSCIndex[cols] = nz;
 }
 
 //c = alpha * op(a) * op(b) + beta*c
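The new nz parameter lets the kernel complete the rebased index array: a column-slice view of a sparse matrix inherits column pointers offset into the parent's storage, and after rebasing, the terminating entry colCSCIndex[cols] must equal the slice's own nonzero count. A CPU sketch of the same operation (not CNTK code):

void shiftColCSCToAbsolute(int* colCSC, int cols, int nz) {
    int base = colCSC[0]; // first offset of the slice view into the parent's storage
    for (int id = 0; id < cols; id++)
        colCSC[id] -= base; // rebase each column pointer to the slice
    colCSC[cols] = nz;      // new in this commit: write the terminating entry
}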
@@ -2958,10 +2985,10 @@ __global__ void _dense1DConvMultSparseCSCTransposeAndAddToDense(

 template<class ElemType>
 __global__ void _reshape(
-    int oldNumRows,                             // old row count
-    int oldNumCols,                             // old col count
-    int newNumRows,                             // new row count
-    int newNumCols,                             // new col count
+    const int oldNumRows,                       // old row count
+    const int oldNumCols,                       // old col count
+    const int newNumRows,                       // new row count
+    const int newNumCols,                       // new col count
     const GPUSPARSE_INDEX_TYPE* oldRowIndex,    // old row index array
     const GPUSPARSE_INDEX_TYPE* oldColumnIndex, // old column index array
     GPUSPARSE_INDEX_TYPE* newRowIndex,          // new row index array
@@ -2978,10 +3005,10 @@
     // initialize to the end and then scan in the right direction in the for-loop
     int currentColStart = oldColumnIndex[oldNumCols];
 
-    for (int oldCol = oldColLower; oldCol <= oldNumCols; oldCol++)
+    for (int oldCol = oldColLower; oldCol < oldNumCols; oldCol++)
    {
        int start = oldColumnIndex[oldCol];
-        int end = (oldCol < oldNumCols) ? oldColumnIndex[oldCol + 1] : oldColumnIndex[oldNumCols] + 1;
+        int end = oldColumnIndex[oldCol + 1];
        bool done = false;
 
        for (int j = start; j < end; j++) //j points to the value
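This loop-bound fix works because oldColumnIndex[oldNumCols] is now a valid terminating entry (see _shiftColCSCIndexFromSliceViewToAbsolute above), so the old special case for oldCol == oldNumCols is no longer needed. For reference, the index math behind a column-major sparse reshape, as a CPU sketch (not CNTK code): each nonzero keeps its flat column-major offset, and only its (row, column) decomposition changes.

void reshapeIndex(int oldRow, int oldCol, int oldNumRows, int newNumRows,
                  int* newRow, int* newCol) {
    long long flat = (long long)oldCol * oldNumRows + oldRow; // flat offset is invariant
    *newCol = (int)(flat / newNumRows);
    *newRow = (int)(flat % newNumRows);
}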
(Diffs for the remaining 6 changed files did not load and are not shown.)