Merge branch 'DanielMerget/fix_atomicAdd' of https://github.com/DanielMerget/CNTK into DanielMerget-DanielMerget/fix_atomicAdd
n17s committed Aug 25, 2016
2 parents aeee8f6 + 6eaacc7 commit 58b7186
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions Source/Math/GPUMatrixCUDAKernels.cuh
@@ -42,8 +42,8 @@
 
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
-// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
-// TODO: This may need to be guarded by CUDA version; newer devices may support this.
+// On older GPUs, CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull = (unsigned long long int*) address;
@@ -55,6 +55,7 @@ static __inline__ __device__ double atomicAdd(double* address, double val)
 } while (assumed != old);
 return __longlong_as_double(old);
 }
+#endif
 
 // TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
 // The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
