From fc2e6c2427329e89458db2a86f7fcb7f65e608a2 Mon Sep 17 00:00:00 2001
From: Daniel Merget
Date: Fri, 19 Aug 2016 13:47:39 +0200
Subject: [PATCH 1/2] avoid double definition of atomicAdd on modern GPUs

---
 Source/Math/GPUMatrixCUDAKernels.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index 8521b457083e..0bedf51562db 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -43,7 +43,7 @@
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
 // CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
-// TODO: This may need to be guarded by CUDA version; newer devices may support this.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull = (unsigned long long int*) address;
@@ -55,6 +55,7 @@ static __inline__ __device__ double atomicAdd(double* address, double val)
     } while (assumed != old);
     return __longlong_as_double(old);
 }
+#endif
 
 // TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
 // The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.

From 6eaacc7a98ec150fe4466e628985864e9953e4d8 Mon Sep 17 00:00:00 2001
From: Daniel Merget
Date: Fri, 19 Aug 2016 14:15:51 +0200
Subject: [PATCH 2/2] clarified comment

---
 Source/Math/GPUMatrixCUDAKernels.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index 0bedf51562db..06add40b2b21 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -42,7 +42,7 @@
 
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
-// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
+// On older GPUs, CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
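
Note: the guard mirrors the pattern recommended for this situation in the CUDA
programming guide. Devices of compute capability 6.0 (Pascal) and newer ship a
built-in atomicAdd(double*, double), so the hand-rolled CAS-loop version must
be compiled out there to avoid a redefinition. A minimal standalone sketch of
the same pattern (the file name, kernel, and launch parameters are
hypothetical, not part of this patch; assumes a CUDA 8.0 or newer toolkit):

    // guarded_atomic_add.cu -- demo of the __CUDA_ARCH__ guard pattern
    #include <cstdio>

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
    // Pre-Pascal fallback: emulate double atomicAdd with a CAS loop,
    // the same technique as the function guarded in the patch above.
    static __inline__ __device__ double atomicAdd(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do
        {
            assumed = old;
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    }
    #endif

    // Hypothetical kernel: 4 x 256 threads each add 1.0 into one accumulator.
    __global__ void sumKernel(double* acc)
    {
        atomicAdd(acc, 1.0); // built-in on sm_60+, CAS fallback below sm_60
    }

    int main()
    {
        double* d_acc;
        cudaMalloc(&d_acc, sizeof(double));
        cudaMemset(d_acc, 0, sizeof(double));
        sumKernel<<<4, 256>>>(d_acc);
        double h_acc = 0.0;
        cudaMemcpy(&h_acc, d_acc, sizeof(double), cudaMemcpyDeviceToHost);
        printf("sum = %f (expected 1024)\n", h_acc);
        cudaFree(d_acc);
        return 0;
    }

Compiled with, e.g., "nvcc -arch=sm_35 guarded_atomic_add.cu" the fallback is
used; with "-arch=sm_60" the same source builds against the built-in overload
without triggering a double definition.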