From fc2e6c2427329e89458db2a86f7fcb7f65e608a2 Mon Sep 17 00:00:00 2001
From: Daniel Merget
Date: Fri, 19 Aug 2016 13:47:39 +0200
Subject: [PATCH 1/2] avoid double definition of atomicAdd on modern GPUs

---
 Source/Math/GPUMatrixCUDAKernels.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index 8521b457083e..0bedf51562db 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -43,7 +43,7 @@
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
 // CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
-// TODO: This may need to be guarded by CUDA version; newer devices may support this.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull = (unsigned long long int*) address;
@@ -55,6 +55,7 @@ static __inline__ __device__ double atomicAdd(double* address, double val)
     } while (assumed != old);
     return __longlong_as_double(old);
 }
+#endif
 
 // TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
 // The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.

From 6eaacc7a98ec150fe4466e628985864e9953e4d8 Mon Sep 17 00:00:00 2001
From: Daniel Merget
Date: Fri, 19 Aug 2016 14:15:51 +0200
Subject: [PATCH 2/2] clarified comment

---
 Source/Math/GPUMatrixCUDAKernels.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index 0bedf51562db..06add40b2b21 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -42,7 +42,7 @@
 
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
-// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
+// On older GPUs, CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
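
Note: the guard mirrors the pattern recommended for this situation in the CUDA
programming guide. Devices of compute capability 6.0 (Pascal) and newer ship a
built-in atomicAdd(double*, double), so the hand-rolled CAS-loop version must
be compiled out there to avoid a redefinition. A minimal standalone sketch of
the same pattern (the file name, kernel, and launch parameters are
hypothetical, not part of this patch; assumes a CUDA 8.0 or newer toolkit):

    // guarded_atomic_add.cu -- demo of the __CUDA_ARCH__ guard pattern
    #include <cstdio>

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
    // Pre-Pascal fallback: emulate double atomicAdd with a CAS loop,
    // the same technique as the function guarded in the patch above.
    static __inline__ __device__ double atomicAdd(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do
        {
            assumed = old;
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    }
    #endif

    // Hypothetical kernel: 4 x 256 threads each add 1.0 into one accumulator.
    __global__ void sumKernel(double* acc)
    {
        atomicAdd(acc, 1.0); // built-in on sm_60+, CAS fallback below sm_60
    }

    int main()
    {
        double* d_acc;
        cudaMalloc(&d_acc, sizeof(double));
        cudaMemset(d_acc, 0, sizeof(double));
        sumKernel<<<4, 256>>>(d_acc);
        double h_acc = 0.0;
        cudaMemcpy(&h_acc, d_acc, sizeof(double), cudaMemcpyDeviceToHost);
        printf("sum = %f (expected 1024)\n", h_acc);
        cudaFree(d_acc);
        return 0;
    }

Compiled with, e.g., "nvcc -arch=sm_35 guarded_atomic_add.cu" the fallback is
used; with "-arch=sm_60" the same source builds against the built-in overload
without triggering a double definition.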