
Commit fd28c55

reviewed uses of atomicAdd();
temporarily disallowed Scatter() reductions

1 parent 80461f4 · commit fd28c55

5 files changed: +73 −35 lines

Source/Math/Convolution.cuh  (+1 −1)

@@ -269,4 +269,4 @@ __global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, cons
     }
 }
 
-} } }
+}}}

Source/Math/GPUMatrix.cu  (+46 −19)

@@ -885,11 +885,11 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
     CUDA_LONG jOut = id / usStride;    // col index into 'us' and 'idx'
 
     auto jInF = idx[jOut * idxStride]; // this is the column we need to get
-    if (jInF < 0)                      // negative index means gap
+    if (isnan(jInF) || jInF < 0)       // negative index means gap
         return;
     size_t jIn = (size_t)jInF;
-    if (jIn >= aCols)
-        return; // actually a failure
+    //if (jIn >= aCols)
+    //    return; // actually a failure
 
     const ElemType& ra = a[ i + jIn * aStride ];
     ElemType& rus = us[id/*i + jOut * usStride*/];
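For readers following the kernel above: ignoring the alpha/beta scaling, the gather it implements can be written as a plain host-side loop. This is a minimal reference sketch (illustrative names, not CNTK code), mirroring the new NaN/negative gap handling:

    // Reference for the gather semantics: us[:,jOut] = a[:,idx[jOut]], column-major storage.
    // A NaN or negative entry in idx marks a gap column, which is simply left untouched.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    template <class ElemType>
    void GatherColumnsReference(std::vector<ElemType>& us, size_t rows, size_t usCols,
                                const std::vector<ElemType>& idx, // one source index per output column
                                const std::vector<ElemType>& a, size_t aCols)
    {
        for (size_t jOut = 0; jOut < usCols; jOut++)
        {
            ElemType jInF = idx[jOut];
            if (std::isnan(jInF) || jInF < 0) // gap: nothing to copy
                continue;
            size_t jIn = (size_t)jInF;
            if (jIn >= aCols)                 // out-of-range index (the kernel no longer guards this)
                continue;
            for (size_t i = 0; i < rows; i++)
                us[i + jOut * rows] = a[i + jIn * rows];
        }
    }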
@@ -928,6 +928,21 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
     return *this;
 }
 
+// little helper for debugging
+template <class ElemType>
+static void Peek(const GPUMatrix<ElemType>& m, const char* which)
+{
+    size_t rows = m.GetNumRows();
+    size_t cols = m.GetNumCols();
+    ElemType buf[10000] = { 0 };
+    size_t n = min(rows * cols, _countof(buf));
+    CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost));
+    UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
+    //CUDA_CALL(cudaMemcpy(const_cast<ElemType*>(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice));
+}
+
+#undef ALLOW_ATOMIC_SCATTER // allow to disable this, until we know atomicAdd() works properly here
+
 template <class ElemType>
 __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, const ElemType alpha, CUDA_LONG numElements)
 {
@@ -941,34 +956,25 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
     CUDA_LONG jIn = id / aStride;      // col index into 'a' and 'idx'
 
     auto jOutF = idx[jIn * idxStride]; // this is the column we copy/add into
-    if (jOutF < 0)                     // negative index means gap
+    if (isnan(jOutF) || jOutF < 0)     // negative index means gap
         return;
     size_t jOut = (size_t)jOutF;
-    if (jOut >= usCols)
-        return; // actually a failure --TODO: This should not be necessary. Why is it?
+    //if (jOut >= usCols)
+    //    return; // actually a failure --TODO: This should not be necessary. Why is it?
 
     const ElemType& ra = a[id/*i + jIn * aStride*/];
     ElemType& rus = us[ i + jOut * usStride ];
 
     ElemType res = ra * alpha;
+#ifdef ALLOW_ATOMIC_SCATTER
     if (res != 0) // avoid memory conflict if e.g. an entire column has no gradient
         atomicAdd(&rus, res); // rus += res;
+#else
+    rus += res;
+#endif
     // Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter())
 }
 
-// little helper for debugging
-template <class ElemType>
-static void Peek(const GPUMatrix<ElemType>& m, const char* which)
-{
-    size_t rows = m.GetNumRows();
-    size_t cols = m.GetNumCols();
-    ElemType buf[10000] = { 0 };
-    size_t n = min(rows * cols, _countof(buf));
-    CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost));
-    UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
-    //CUDA_CALL(cudaMemcpy(const_cast<ElemType*>(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice));
-}
-
 // *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta
 template <class ElemType>
 GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, const GPUMatrix<ElemType>& idx, const GPUMatrix<ElemType>& a, ElemType alpha)
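Why Scatter needs atomicAdd at all: several source columns j may name the same target column idx[j], so distinct GPU threads can accumulate into the same element of us concurrently, and a plain read-modify-write would lose updates. A serial host-side sketch of the intended accumulation (illustrative only, not CNTK code):

    // Reference for the scatter-add: us[:,idx[j]] += alpha * a[:,j], column-major storage.
    // If idx[j1] == idx[j2], both contributions must sum into the same target column; on
    // the GPU that is only safe with atomicAdd(). The plain 'rus += res' path above is
    // therefore valid only when every target column occurs at most once in idx -- which
    // is exactly what the check added further below verifies.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    template <class ElemType>
    void ScatterColumnsAddReference(std::vector<ElemType>& us, size_t rows, size_t usCols,
                                    const std::vector<ElemType>& idx, // one target index per source column
                                    const std::vector<ElemType>& a, ElemType alpha)
    {
        for (size_t jIn = 0; jIn < idx.size(); jIn++)
        {
            ElemType jOutF = idx[jIn];
            if (std::isnan(jOutF) || jOutF < 0) // gap column: contributes nothing
                continue;
            size_t jOut = (size_t)jOutF;
            if (jOut >= usCols)
                continue;
            for (size_t i = 0; i < rows; i++)
                us[i + jOut * rows] += alpha * a[i + jIn * rows]; // serial, so += cannot race here
        }
    }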
@@ -986,6 +992,27 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
 
     auto& us = *this;
 
+#ifndef ALLOW_ATOMIC_SCATTER // verify that atomicAdd is not needed --this is not efficient
+    {
+        vector<ElemType> buf(idx.GetNumRows() * idx.GetNumCols()); // idx(,)are the column(s) we copy/add into
+        CUDA_CALL(cudaMemcpy(buf.data(), idx.Data(), sizeof(ElemType) * buf.size(), cudaMemcpyDeviceToHost));
+        vector<bool> writtenTo(GetNumCols(), false); // remember whether an output column is in fact a target
+        for (size_t i = 0; i < buf.size(); i++)
+        {
+            auto colF = buf[i];
+            if (isnan(colF) || colF < 0)
+                continue;
+            size_t col = (size_t)colF;
+            if (col >= GetNumCols())
+                LogicError("DoScatterColumnsOf: Index value out of bounds.");
+            if (writtenTo[col])
+                LogicError("DoScatterColumnsOf: #ifndef ALLOW_ATOMIC_SCATTER then columns must be unique. Column idx(%d,%d)=%d is used twice.", (int)(i % idx.GetNumCols()), (int)(i / idx.GetNumCols()), (int)col);
+            else
+                writtenTo[col] = true;
+        }
+    }
+#endif
+
     // pre-scale with beta upfront
     // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding.
     Scale(beta, us); // if beta is 0, then this will be a memset()

Source/Math/GPUMatrixCUDAKernels.cuh  (+7 −1)

@@ -43,6 +43,7 @@
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
 // CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
+// TODO: This may need to be guarded by CUDA version; newer devices may support this.
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull = (unsigned long long int*) address;
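Only the first line of this overload's body is visible in the hunk; it presumably continues with the standard compare-and-swap retry loop from the CUDA C Programming Guide, roughly as sketched below (illustrative name, not a verbatim copy of the file). The TODO above refers to the fact that newer GPUs (sm_60 and later) provide a native double-precision atomicAdd().

    // Emulate atomicAdd on a double by retrying a 64-bit atomicCAS until no other
    // thread has modified the value in between.
    static __inline__ __device__ double atomicAddDoubleSketch(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do
        {
            assumed = old;
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old); // another thread intervened; retry with the new value
        return __longlong_as_double(old); // return the previous value, like the built-in
    }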
@@ -3152,7 +3153,8 @@ __global__ void _scaleSparseBlockAndAddToDense(
     rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
 }
 
-// compute predictions in cross entory node
+#if 0
+// compute predictions in cross entropy node
 template <class ElemType>
 __global__ void _computePrediction(
     int nv,
@@ -3335,6 +3337,7 @@ __global__ void _computeGradientOfInput(
 
     atomicAdd(&grd[IDX2C(h, j, numrows)], sum);
 }
+#endif
 
 template <class ElemType>
 __global__ void computeNCEForwardProp(
@@ -3713,6 +3716,8 @@ __global__ void _assignNceDerivativeNew(
         atomicAdd(&c[wid], -er);
     }
 }
+
+#if 0
 // compute gradients of weights in cross entropy node
 template <class ElemType>
 __global__ void _computeGradientOfWeight(
@@ -3774,6 +3779,7 @@ __global__ void _computeGradientOfWeight(
         blockIds[ii] = i;
     }
 }
+#endif
 
 // used in clipping gradients
 template <class ElemType>

Source/Math/GPUTensor.cu  (+12 −4)

@@ -393,6 +393,8 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
     }
 };
 
+#define ALLOW_ATOMIC_REDUCTION // undefine to disable use of atomicAdd() below, for testing it
+
 // specialization for k = -1 terminates the template recursion, and computes reductions in parallel
 template <class ElemType, C_size_t N, C_int M, C_int K>
 struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
@@ -403,8 +405,8 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
                                const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
                                const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
     {
-        CUDA_LONG reductionBlock = blockIdx.z;  // block index --larger reductions are split into blocks
-        CUDA_LONG reductionBlocks = gridDim.z;  // number of blocks
+        CUDA_LONG reductionBlock = blockIdx.z;  // reduction-block index --larger reductions are split into blocks
+        CUDA_LONG reductionBlocks = gridDim.z;  // number of reduction blocks. If >1 we need atomicAdd
         CUDA_LONG tid = threadIdx.x;            // thread index
         CUDA_LONG tids = blockDim.x;            // out of how many threads --note: last block is partial
 
@@ -427,7 +429,7 @@
         }
 
         // reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
-        __shared__ ReduceElemType accumulators[GridDim::maxThreadsPerBlock /*tids*/];
+        __shared__ ReduceElemType volatile accumulators[GridDim::maxThreadsPerBlock /*tids*/];
         accumulators[tid] = sum;
         __syncthreads();
         static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
@@ -450,8 +452,12 @@
         auto* pout = pointers[pointers.size() - 1];
         if (reductionBlocks > 1) // multiple blocks: need to use atomicAdd()
         {
+#ifdef ALLOW_ATOMIC_REDUCTION
             // in this case, outer calling code must pass beta = 1
             atomicAdd(pout, val);
+#else
+            *pout = 1000000.0f; // something that can't be missed? How to crash it?
+#endif
         }
         else
         {
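For context, these specializations follow the usual two-level CUDA reduction: each block reduces its chunk of the data through shared memory, and when the reduction is split over several blocks (here along gridDim.z), each block folds its partial sum into the output with atomicAdd(), which is why the caller must pass beta = 1. A minimal self-contained sketch of that pattern (not CNTK's templated code; it uses gridDim.x and a fixed block size for simplicity):

    #define THREADS 256 // must be a power of two and match the launch configuration

    // Each block reduces its chunk of 'in' in shared memory; block partial sums are
    // then accumulated into *out with atomicAdd(). Because blocks add onto *out,
    // the caller must have pre-initialized/pre-scaled it (the "beta = 1" convention).
    __global__ void reduceSumSketch(const float* in, size_t n, float* out)
    {
        __shared__ float acc[THREADS];
        size_t chunk = (n + gridDim.x - 1) / gridDim.x; // elements per block
        size_t begin = (size_t)blockIdx.x * chunk;
        size_t end   = begin + chunk < n ? begin + chunk : n;

        float sum = 0;
        for (size_t i = begin + threadIdx.x; i < end; i += blockDim.x)
            sum += in[i];                               // thread-local partial sum
        acc[threadIdx.x] = sum;
        __syncthreads();

        for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
        {
            if (threadIdx.x < stride)
                acc[threadIdx.x] += acc[threadIdx.x + stride]; // shared-memory tree reduction
            __syncthreads();
        }

        if (threadIdx.x == 0)
            atomicAdd(out, acc[0]); // multiple blocks: accumulate rather than overwrite
    }
    // launch e.g. as: reduceSumSketch<<<numBlocks, THREADS>>>(d_in, n, d_out);
    // with *d_out already holding beta * previousValue (0 for a plain sum).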
@@ -560,8 +566,9 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
     C_size_t reductionDim = 1; // number of elements to reduce over
     for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
         reductionDim *= (C_size_t) reducingOpDimVector[k];
-    let& props = GridDim::GetDeviceProps();
     GridDim grid(NN);
+#ifdef ALLOW_ATOMIC_REDUCTION // temporarily disabled to ensure it is not causing the non-reproducability
+    let& props = GridDim::GetDeviceProps();
     if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount) // TODO: <= multiProcessorCount?
     {
         // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
@@ -603,6 +610,7 @@
         }
     }
     else
+#endif
     {
         // we got enough elements to generate: do one element per thread, and reduction inside
         _launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);

Source/Math/latticefunctionskernels.h  (+7 −10)

@@ -16,11 +16,7 @@
 #include "latticestorage.h"
 #include <limits>
 
-namespace msra { namespace cuda {
-
-class passtextureref;
-}
-}
+namespace msra { namespace cuda { class passtextureref; } }
 
 #ifdef CPUONLY
 #define __kernel_emulation__
@@ -34,7 +30,8 @@ using namespace std;
 #define __device__
 #endif
 #define CUDART_MIN_DENORM_F numeric_limits<float>::denorm_min()
-#define atomicAdd(address, value) (*(address) += (value)) // don't forget to #undef (#praga pop_macro)! Otherwise CUDA might compile with this...
+// renamed to x- so we make sure to not accidentally use these; rename back if ever needed again
+#define xatomicAdd(address, value) (*(address) += (value)) // don't forget to #undef (#praga pop_macro)! Otherwise CUDA might compile with this...
 #define atomicCAS(address, compare, val) \
     *address; \
     *address = *address == compare ? val : *address;
@@ -47,8 +44,8 @@ using namespace std;
 #if __CUDA_ARCH__ < 200
 //#warning Sequence training not supported on 1.x CUDA machines.
 #define force_crash() (*((int *) -1) = 0) // TODO: this does not in fact seem to crash it...
-#define atomicAdd(a, v) (force_crash(), *(a) = v) // force a crash if used with 1.x devices
-#define atomicCAS(address, compare, val) (*(address) = compare + val, *((int *) -1) = 0)
+#define xatomicAdd(a, v) (force_crash(), *(a) = v) // force a crash if used with 1.x devices
+#define xatomicCAS(address, compare, val) (*(address) = compare + val, *((int *) -1) = 0)
 #define __double_as_longlong(in) (force_crash(), in)
 #define __longlong_as_double(in) (force_crash(), in)
 #define __float_as_int(in) (force_crash(), in)
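These macros exist because the header can also be compiled in CPU-emulation mode (CPUONLY / __kernel_emulation__), where CUDA intrinsics such as atomicAdd() are unavailable and are replaced by plain expressions; the matching #pragma pop_macro calls in the last hunk undo the substitution so real CUDA compilation is unaffected, and this commit renames the atomicAdd replacements to xatomicAdd so nothing can silently fall back to the non-atomic version. A compressed, illustrative sketch of the push/define/pop pattern (not a copy of the header):

    #pragma push_macro("atomicAdd")         // save any existing definition
    #ifdef __kernel_emulation__
    #define atomicAdd(address, value) (*(address) += (value)) // single-threaded emulation only
    #endif

    // ... kernel-style code that calls atomicAdd(&x, 1.0f) compiles in both modes ...

    #pragma pop_macro("atomicAdd")          // restore, so real CUDA code is unaffected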
@@ -956,8 +953,8 @@ struct latticefunctionskernels
         }
     }
 };
-};
-};
+
+}};
 
 #pragma pop_macro("atomicCAS")
 #pragma pop_macro("atomicAdd")
