@@ -885,11 +885,11 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
     CUDA_LONG jOut = id / usStride; // col index into 'us' and 'idx'
 
     auto jInF = idx[jOut * idxStride]; // this is the column we need to get
-    if (jInF < 0)                      // negative index means gap
+    if (isnan(jInF) || jInF < 0)       // negative index means gap
         return;
     size_t jIn = (size_t)jInF;
-    if (jIn >= aCols)
-        return; // actually a failure
+    //if (jIn >= aCols)
+    //    return; // actually a failure
 
     const ElemType& ra  =  a[i + jIn * aStride];
     ElemType&       rus = us[id/*i + jOut * usStride*/];
@@ -928,6 +928,21 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
     return *this;
 }
 
+// little helper for debugging
+template <class ElemType>
+static void Peek(const GPUMatrix<ElemType>& m, const char* which)
+{
+    size_t rows = m.GetNumRows();
+    size_t cols = m.GetNumCols();
+    ElemType buf[10000] = { 0 };
+    size_t n = min(rows * cols, _countof(buf));
+    CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost));
+    UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
+    //CUDA_CALL(cudaMemcpy(const_cast<ElemType*>(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice));
+}
+
+#undef ALLOW_ATOMIC_SCATTER // allow to disable this, until we know atomicAdd() works properly here
+
 template <class ElemType>
 __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, const ElemType alpha, CUDA_LONG numElements)
 {
@@ -941,34 +956,25 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
     CUDA_LONG jIn = id / aStride; // col index into 'a' and 'idx'
 
     auto jOutF = idx[jIn * idxStride]; // this is the column we copy/add into
-    if (jOutF < 0)                     // negative index means gap
+    if (isnan(jOutF) || jOutF < 0)     // negative index means gap
         return;
     size_t jOut = (size_t)jOutF;
-    if (jOut >= usCols)
-        return; // actually a failure  --TODO: This should not be necessary. Why is it?
+    //if (jOut >= usCols)
+    //    return; // actually a failure  --TODO: This should not be necessary. Why is it?
 
     const ElemType& ra  =  a[id/*i + jIn * aStride*/];
     ElemType&       rus = us[i + jOut * usStride];
 
     ElemType res = ra * alpha;
+#ifdef ALLOW_ATOMIC_SCATTER
     if (res != 0)             // avoid memory conflict if e.g. an entire column has no gradient
         atomicAdd(&rus, res); // rus += res;
+#else
+    rus += res;
+#endif
     // Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter())
 }
 
-// little helper for debugging
-template <class ElemType>
-static void Peek(const GPUMatrix<ElemType>& m, const char* which)
-{
-    size_t rows = m.GetNumRows();
-    size_t cols = m.GetNumCols();
-    ElemType buf[10000] = { 0 };
-    size_t n = min(rows * cols, _countof(buf));
-    CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost));
-    UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
-    //CUDA_CALL(cudaMemcpy(const_cast<ElemType*>(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice));
-}
-
 // *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta
 template <class ElemType>
 GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, const GPUMatrix<ElemType>& idx, const GPUMatrix<ElemType>& a, ElemType alpha)
@@ -986,6 +992,27 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
 
     auto& us = *this;
 
+#ifndef ALLOW_ATOMIC_SCATTER // verify that atomicAdd is not needed  --this is not efficient
+    {
+        vector<ElemType> buf(idx.GetNumRows() * idx.GetNumCols()); // idx(,) are the column(s) we copy/add into
+        CUDA_CALL(cudaMemcpy(buf.data(), idx.Data(), sizeof(ElemType) * buf.size(), cudaMemcpyDeviceToHost));
+        vector<bool> writtenTo(GetNumCols(), false); // remember whether an output column is in fact a target
+        for (size_t i = 0; i < buf.size(); i++)
+        {
+            auto colF = buf[i];
+            if (isnan(colF) || colF < 0)
+                continue;
+            size_t col = (size_t)colF;
+            if (col >= GetNumCols())
+                LogicError("DoScatterColumnsOf: Index value out of bounds.");
+            if (writtenTo[col])
+                LogicError("DoScatterColumnsOf: #ifndef ALLOW_ATOMIC_SCATTER then columns must be unique. Column idx(%d,%d)=%d is used twice.", (int)(i % idx.GetNumCols()), (int)(i / idx.GetNumCols()), (int)col);
+            else
+                writtenTo[col] = true;
+        }
+    }
+#endif
+
     // pre-scale with beta upfront
     // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding.
     Scale(beta, us); // if beta is 0, then this will be a memset()
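
For reference, a minimal host-side sketch (not from this commit) of the index convention that both kernels and the verification block above rely on: an idx entry that is NaN or negative marks a gap and is skipped, and with ALLOW_ATOMIC_SCATTER undefined every non-gap target column must be unique. The helper name VerifyScatterTargets and the use of std::vector/std::logic_error (instead of CNTK's LogicError) are assumptions for illustration only.

#include <cmath>
#include <stdexcept>
#include <vector>

// Hypothetical CPU-side mirror of the check added in DoScatterColumnsOf:
// idx[j] names the target column for source column j; NaN or a negative value means "gap".
template <class ElemType>
static void VerifyScatterTargets(const std::vector<ElemType>& idx, size_t numTargetCols)
{
    std::vector<bool> writtenTo(numTargetCols, false); // remember which output columns are already targets
    for (size_t j = 0; j < idx.size(); j++)
    {
        ElemType colF = idx[j];
        if (std::isnan(colF) || colF < 0) // gap: source column j is dropped entirely
            continue;
        size_t col = (size_t)colF;
        if (col >= numTargetCols)
            throw std::logic_error("scatter index out of bounds");
        if (writtenTo[col]) // without atomicAdd, two source columns may not share one target
            throw std::logic_error("scatter target column used twice");
        writtenTo[col] = true;
    }
}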