Commit 8ae5176

Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/separate1bitDataParallelSGD

amitaga committed Jan 15, 2016
2 parents a8029ea + 9241f30
Showing 9 changed files with 167 additions and 118 deletions.
README.md (4 changes: 2 additions & 2 deletions)

@@ -70,7 +70,7 @@ Linux users should clone from this URL: https://git.codeplex.com/cntk

 git clone https://git.codeplex.com/cntk
 
-More detail you can follow this thread:
+For more details you can follow this thread:
 http://codeplex.codeplex.com/workitem/26133


@@ -138,7 +138,7 @@ Make sure the following CUDA environment variables are set to the correct path
 CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 CUDA_PATH_V7_0=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 
-Open the CNTKSolution and build the CNTK project.
+Open the CNTK solution and build the CNTK project.
 
 Note: If you make modifications to the code, please first disable the
 insertion of TAB characters. If you use Visual Studio as your editor,
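Aside: a quick way to confirm these variables are actually visible to the build is a short check program. This is a hypothetical helper, not part of CNTK or of this diff.

#include <cstdio>
#include <cstdlib>

int main() {
    // Print the CUDA variables the README asks for; "(not set)" flags a problem.
    for (const char* name : {"CUDA_PATH", "CUDA_PATH_V7_0"}) {
        const char* value = std::getenv(name);
        std::printf("%s = %s\n", name, value ? value : "(not set)");
    }
    return 0;
}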
Source/Math/ConvolutionEngine.cpp (37 changes: 18 additions & 19 deletions)

@@ -49,16 +49,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c()); UNUSED(packedInputRows);
 
     // GPU and 1-dimensional image
-    bool gpuSparse1D = (inT.h() == 1 &&
+    m_gpuSparseOpt = (filterT.h() == 1 &&
                         in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
                         convDesc.wStride() == 1 &&
                         !convDesc.padding() &&
                         in.GetMatrixType() == MatrixType::SPARSE);
+    m_gpuSparse1D = (m_gpuSparseOpt && inT.h() == 1);
 
     out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
 
     // Reshaping is only necessary if we are going to use the unpacking trick
-    if (!gpuSparse1D)
+    if (m_gpuSparseOpt)
+        out.Reshape(outT.c() * outT.w(), outT.h() * batchSize);
+    else
         out.Reshape(outT.c(), outputSizePerChannel * batchSize);
 
     size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
@@ -75,17 +78,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             // [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
             // [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
             // [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
-            if (in.GetMatrixType() == MatrixType::DENSE)
+            if (in.GetMatrixType() == MatrixType::DENSE || m_gpuSparse1D)
                 inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
             else
                 inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
 
-            if (gpuSparse1D)
+            if (m_gpuSparseOpt)
             {
                 if (filterT.w() * inT.c() != filter.GetNumCols())
                     LogicError("Kernel width and weight matrix dimensions don't match.");
 
-                Mat outputSubBatch = out.ColumnSlice(startSampleId, smallBatchSize);
+                inputSubBatch.Reshape(inT.c() * inT.w(), inT.h() * smallBatchSize);
+                Mat outputSubBatch = out.ColumnSlice(startSampleId, outT.h() * smallBatchSize);
                 Mat::ConvolveAndWeightedAdd(1, filter, false, inputSubBatch, false, 0, outputSubBatch,
                                             static_cast<int>(inT.c()), convDesc.wStride(), convDesc.padding(), true);
             }
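To see at a glance what this hunk changes: the old single gpuSparse1D test is split into two flags. A minimal standalone sketch of how they are derived and what they gate (hypothetical simplified types, not CNTK code): m_gpuSparseOpt selects the dedicated sparse kernel whenever the filter is one row high, and m_gpuSparse1D additionally requires a 1-D input, which then takes the cheap ColumnSlice path instead of the costly sparse-to-dense SetValue conversion.

#include <cstdio>

enum class MatrixType { DENSE, SPARSE };
enum class Location { CPU, GPU };

struct ConvOptions {
    MatrixType inputType;
    Location location;
    int wStride;
    bool padding;
    int filterH; // kernel height (filterT.h())
    int inputH;  // input image height (inT.h())
};

struct SparseOptFlags {
    bool gpuSparseOpt; // take the dedicated sparse convolution kernel
    bool gpuSparse1D;  // input is also 1-D: slice instead of converting to dense
};

SparseOptFlags computeFlags(const ConvOptions& o) {
    SparseOptFlags f{};
    f.gpuSparseOpt = o.filterH == 1 &&
                     o.location == Location::GPU &&
                     o.wStride == 1 &&
                     !o.padding &&
                     o.inputType == MatrixType::SPARSE;
    f.gpuSparse1D = f.gpuSparseOpt && o.inputH == 1;
    return f;
}

int main() {
    // A text-style workload: sparse GPU input, 1-row kernel, stride 1, no padding.
    ConvOptions o{MatrixType::SPARSE, Location::GPU, 1, false, 1, 1};
    SparseOptFlags f = computeFlags(o);
    std::printf("gpuSparseOpt=%d gpuSparse1D=%d\n", f.gpuSparseOpt, f.gpuSparse1D);
    return 0;
}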
@@ -195,14 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
     size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
 
-    // GPU and 1-dimensional image
-    bool gpuSparse1D = (inT.h() == 1 &&
-                        in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
-                        convDesc.wStride() == 1 &&
-                        !convDesc.padding() &&
-                        in.GetMatrixType() == MatrixType::SPARSE);
-
-    if (numSubBatches == 1 && allowReuse && !gpuSparse1D) //reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
+    if (numSubBatches == 1 && allowReuse && !m_gpuSparseOpt) //reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
         // REVIEW alexeyk: the following makes an assumption that data in workspace was filled by Forward call and remained unchanged. Find way to enforce/verify that.
         Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, filter);
     else
@@ -218,19 +215,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 // [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
                 // [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
                 // [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
-                if (gpuSparse1D)
+                if (m_gpuSparseOpt)
                 {
                     Matrix<ElemType> inputSubBatch;
                     inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
-                    inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w());
+                    inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w() * inT.h());
                     Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
-                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize, inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
+                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize * inT.h(), inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
 
-                    Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
-                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize, srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
+                    Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.h() * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
+                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize * srcGradT.h(), srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
 
                     filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
-                    Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize, convDesc.wStride(), convDesc.padding(), false);
+                    Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize * inT.h(), convDesc.wStride(), convDesc.padding(), false);
                     filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
                 }
                 else
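The backward path now shuffles over smallBatchSize * inT.h() instead of smallBatchSize, extending the sparse path from strictly 1-D inputs to inputs with h > 1 (the filter must still be one row high). A CPU reference of the axis swap that TensorShuffleScaleAndAdd performs may help; this is a sketch under the (D x S x M x K x T) layout stated in the kernel comments, not CNTK code.

#include <cstddef>
#include <vector>

// Per column t, rewrite a (D x S x M x K) block as (D x K x M x S): the S and K
// axes are swapped while D stays fastest-varying.
std::vector<float> shuffleDSMKT(const std::vector<float>& a,
                                std::size_t D, std::size_t S, std::size_t M,
                                std::size_t K, std::size_t T) {
    std::vector<float> c(a.size());
    for (std::size_t t = 0; t < T; t++)
        for (std::size_t k = 0; k < K; k++)
            for (std::size_t m = 0; m < M; m++)
                for (std::size_t s = 0; s < S; s++)
                    for (std::size_t d = 0; d < D; d++) {
                        std::size_t na = (((t * K + k) * M + m) * S + s) * D + d; // source: d, s, m, k order
                        std::size_t nc = (((t * S + s) * M + m) * K + k) * D + d; // destination: s and k swapped
                        c[nc] = a[na];
                    }
    return c;
}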
@@ -314,6 +311,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 private:
     size_t m_maxTempMemSizeInSamples;
     Mat m_ones;
+    bool m_gpuSparseOpt;
+    bool m_gpuSparse1D;
 };
 
 template class ConvolutionEngine<float>;
Source/Math/GPUMatrixCUDAKernels.cuh (73 changes: 50 additions & 23 deletions)

@@ -1285,7 +1285,7 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
     size_t nz)
 {
     CUDA_LONG N = blockDim.x * blockIdx.x + threadIdx.x; // input tensor of dimension (D x S x M x K x T)
-    if (N >= nz || N < aColCSCIndex[0])
+    if (N < aColCSCIndex[0] || N >= aColCSCIndex[T])
         return;
 
     size_t col;
@@ -1309,9 +1309,10 @@
         size_t nc = ((s * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S): k/K and s/S swapped
 
         int rowIdx = start;
-        for (size_t na_i = start; na_i < end; na_i++)
+        for (size_t j = start; j < end; j++)
         {
             // recover the 5 indices from the loop counter
+            size_t na_i = aRowIndex[j];
             size_t d_i = (na_i) % D;
             size_t s_i = (na_i / D) % S;
             size_t m_i = (na_i / D / S) % M;
@@ -1328,7 +1329,7 @@
     cnzValues[rowIdx] = anzValues[N];
     cRowIndex[rowIdx] = nc;
 
-    if (N == nz - 1)
+    if (N == 0)
     {
         for (int i = 0; i <= T; i++)
        {
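The loop fix above matters because in CSC storage the position j of a nonzero inside a column is not its row: the row must be read from the row-index array, which is exactly what the added aRowIndex[j] lookup does. A minimal CSC traversal sketch (hypothetical example data, not CNTK code):

#include <cstdio>

int main() {
    // 3x3 matrix [[1,0,2],[0,0,0],[0,3,4]] in CSC form.
    float values[]   = {1.0f, 3.0f, 2.0f, 4.0f};
    int   rowIndex[] = {0, 2, 0, 2}; // row of each stored value
    int   colCSC[]   = {0, 1, 2, 4}; // column c spans [colCSC[c], colCSC[c+1])
    for (int c = 0; c < 3; c++)
        for (int j = colCSC[c]; j < colCSC[c + 1]; j++)
            std::printf("a[%d][%d] = %g\n", rowIndex[j], c, values[j]);
    return 0;
}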
@@ -2788,35 +2789,57 @@ __global__ void _isValid(
     )
 {
     CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id >= cols)
+    if (id >= cols || d_res[0] <= 0)
         return;
 
     int start = colCSCIndex[id];
     int end = colCSCIndex[id + 1];
-    d_res[0] = 1;
 
     if (start > end)
     {
-        d_res[0] = -1;
-        d_res[1] = start;
-        d_res[2] = end;
+        if (d_res[0] > 0)
+        {
+            d_res[0] = -1;
+            d_res[1] = id;
+            d_res[2] = start;
+            d_res[3] = end;
+        }
     }
     else if (end > nz)
     {
-        d_res[0] = -2;
-        d_res[1] = end;
-        d_res[2] = nz;
+        if (d_res[0] > 0)
+        {
+            d_res[0] = -2;
+            d_res[1] = id + 1;
+            d_res[2] = end;
+            d_res[3] = nz;
+        }
     }
     else
     {
         for (int j = start; j < end; j++) //j points to the value
         {
-            if (rowIndex[j] > rows)
+            if (rowIndex[j] >= rows)
             {
-                d_res[0] = -3;
-                d_res[1] = rowIndex[j];
-                d_res[2] = rows;
-                break;
+                if (d_res[0] > 0)
+                {
+                    d_res[0] = -3;
+                    d_res[1] = j;
+                    d_res[2] = rowIndex[j];
+                    d_res[3] = rows;
+                    break;
+                }
             }
+            if (j > start && rowIndex[j] < rowIndex[j - 1])
+            {
+                if (d_res[0] > 0)
+                {
+                    d_res[0] = -4;
+                    d_res[1] = id;
+                    d_res[2] = j;
+                    d_res[3] = rowIndex[j];
+                    break;
+                }
+            }
         }
     }
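The reworked _isValid adds a fourth failure mode (-4: row indices within a column not in ascending order), fixes the bounds test to rowIndex[j] >= rows, and guards each write with d_res[0] > 0 so a later thread is unlikely to clobber the first recorded diagnostics; d_res[0] is presumably now pre-initialized to 1 by the caller, since the in-kernel initialization was removed. A single-threaded reference of the same checks and result codes (a sketch, not CNTK code):

// res[0] > 0 means valid; negative values mirror the kernel's error codes,
// with res[1..3] carrying the same diagnostics the kernel records.
void isValidCSC(const int* colCSC, const int* rowIndex,
                int rows, int cols, int nz, int res[4]) {
    res[0] = 1; // caller-side initialization the kernel now relies on
    for (int id = 0; id < cols && res[0] > 0; id++) {
        int start = colCSC[id];
        int end = colCSC[id + 1];
        if (start > end) {            // column range runs backwards
            res[0] = -1; res[1] = id; res[2] = start; res[3] = end;
        } else if (end > nz) {        // column range exceeds the nonzero count
            res[0] = -2; res[1] = id + 1; res[2] = end; res[3] = nz;
        } else {
            for (int j = start; j < end; j++) {
                if (rowIndex[j] >= rows) {                        // row out of bounds
                    res[0] = -3; res[1] = j; res[2] = rowIndex[j]; res[3] = rows;
                    break;
                }
                if (j > start && rowIndex[j] < rowIndex[j - 1]) { // rows not sorted
                    res[0] = -4; res[1] = id; res[2] = j; res[3] = rowIndex[j];
                    break;
                }
            }
        }
    }
}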
@@ -2825,14 +2848,18 @@
 template<class ElemType>
 __global__ void _shiftColCSCIndexFromSliceViewToAbsolute(
     GPUSPARSE_INDEX_TYPE* colCSCIndex,
-    const int cols
+    const int cols,
+    const int nz
     )
 {
     CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
     if (id >= cols)
         return;
 
     colCSCIndex[id] = colCSCIndex[id] - colCSCIndex[0];
+
+    if (id == cols - 1)
+        colCSCIndex[cols] = nz;
 }
 
 //c = alpha * op(a) * op(b) + beta*c
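The new nz parameter lets the kernel complete the rebased index array: a column-slice view of a sparse matrix inherits column pointers offset into the parent's storage, and after rebasing, the terminating entry colCSCIndex[cols] must equal the slice's own nonzero count. A CPU sketch of the same operation (not CNTK code):

void shiftColCSCToAbsolute(int* colCSC, int cols, int nz) {
    int base = colCSC[0]; // first offset of the slice view into the parent's storage
    for (int id = 0; id < cols; id++)
        colCSC[id] -= base; // rebase each column pointer to the slice
    colCSC[cols] = nz;      // new in this commit: write the terminating entry
}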
@@ -2958,10 +2985,10 @@ __global__ void _dense1DConvMultSparseCSCTransposeAndAddToDense(

 template<class ElemType>
 __global__ void _reshape(
-    int oldNumRows,                             // old row count
-    int oldNumCols,                             // old col count
-    int newNumRows,                             // new row count
-    int newNumCols,                             // new col count
+    const int oldNumRows,                       // old row count
+    const int oldNumCols,                       // old col count
+    const int newNumRows,                       // new row count
+    const int newNumCols,                       // new col count
     const GPUSPARSE_INDEX_TYPE* oldRowIndex,    // old row index array
     const GPUSPARSE_INDEX_TYPE* oldColumnIndex, // old column index array
     GPUSPARSE_INDEX_TYPE* newRowIndex,          // new row index array
@@ -2978,10 +3005,10 @@
     // initialize to the end and then scan in the right direction in the for-loop
     int currentColStart = oldColumnIndex[oldNumCols];
 
-    for (int oldCol = oldColLower; oldCol <= oldNumCols; oldCol++)
+    for (int oldCol = oldColLower; oldCol < oldNumCols; oldCol++)
    {
        int start = oldColumnIndex[oldCol];
-        int end = (oldCol < oldNumCols) ? oldColumnIndex[oldCol + 1] : oldColumnIndex[oldNumCols] + 1;
+        int end = oldColumnIndex[oldCol + 1];
        bool done = false;
 
        for (int j = start; j < end; j++) //j points to the value
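This loop-bound fix works because oldColumnIndex[oldNumCols] is now a valid terminating entry (see _shiftColCSCIndexFromSliceViewToAbsolute above), so the old special case for oldCol == oldNumCols is no longer needed. For reference, the index math behind a column-major sparse reshape, as a CPU sketch (not CNTK code): each nonzero keeps its flat column-major offset, and only its (row, column) decomposition changes.

void reshapeIndex(int oldRow, int oldCol, int oldNumRows, int newNumRows,
                  int* newRow, int* newCol) {
    long long flat = (long long)oldCol * oldNumRows + oldRow; // flat offset is invariant
    *newCol = (int)(flat / newNumRows);
    *newRow = (int)(flat % newNumRows);
}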
(Diffs for the remaining 6 changed files did not load and are not shown.)