Add randomization to GPU NCE update

raden · Jul 10, 2015 · 9ea1406 · 9ea1406
1 parent 6168015
commit 9ea1406
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 30 deletions.
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
@@ -1959,6 +1959,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
     }
+
     template<class ElemType>
     void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
     {

diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -3123,6 +3123,7 @@ __global__ void _assignNceDerivative(
         }
     }
 }
+
 template<class ElemType>
 __global__ void _assignNceDerivativeNew(
     const ElemType* val,
@@ -3144,13 +3145,8 @@ __global__ void _assignNceDerivativeNew(
     // tmp is a matrix of precalculated error
     // c is the output matrix to store calculated gradients
 
-    // assume a 1 dimensional thread array
-    int tx = threadIdx.x; // thread index in thread-block (0-indexed)
-    int bx = blockIdx.x;  // block index (0-indexed)
-    int bdim = blockDim.x; // number of threads in thread block
-
     // logical single index for this thread
-    int n = tx + bdim*bx;
+    int n = threadIdx.x + blockDim.x* blockIdx.x;
 
     int batchId = n / sampleCount;
     int total = numRows * sampleCount;
@@ -3159,47 +3155,27 @@ __global__ void _assignNceDerivativeNew(
     {
         int wid = (int)val[2 * n];
         ElemType er = tmp[n];
-        //c[n] = a[n] + b[n]; // this thread does one addition
         if (inputIndex == 1)
         {
-            for (int j = 0; j < width; j++)
+            for (int i = 0; i < width; i++)
             {
+                int j = (i + n) % width; //introduce randomization to avoid conflicts
                 ElemType val = -er * b[IDX2C(j, wid, width)];
                 atomicAdd(&c[IDX2C(j, batchId, width)], val);
             }
         }
         else if (inputIndex == 2)
         {
-            for (int j = 0; j < width; j++)
+            for (int i = 0; i < width; i++)
             {
+                int j = (i + n) % width; //introduce randomization to avoid conflicts
                 ElemType val = -er * a[IDX2C(j, batchId, width)];
                 atomicAdd(&c[IDX2C(j, wid, width)], val);
             }
         }
         else
             atomicAdd(&c[wid], -er);
     }
-    /*
-    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
-
-    // find out the items this block is responsible for
-    int start = loadPerBlock * blockIdx.x;
-    int end = min(total, loadPerBlock * (blockIdx.x + 1));
-    // find out the items this block is responsible for
-  
-    for (int i = start; i < end; i++)
-    {
-        int wid = (int)val[2 * i];
-
-        ElemType er = tmp[i]; // precalculated error for this output node
-        if (inputIndex == 3) //bias vector
-        {
-            //ElemType val = -er;
-            atomicAdd(&c[wid], -er);
-            //c[wid] -= er;
-        }
-    }
-    */
 }
 // compute gradients of weights in cross entropy node
 template<class ElemType>