forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTHCReduceApplyUtils.cuh
81 lines (65 loc) · 2.05 KB
/
THCReduceApplyUtils.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#ifndef THC_REDUCE_APPLY_UTILS_INC
#define THC_REDUCE_APPLY_UTILS_INC
#include <cuda.h>
#include <assert.h>
#include "THCGeneral.h"
#include "THCTensor.h"
#include "THCDeviceUtils.cuh"
#include "THCTensorInfo.cuh"
// Enum that indicates whether tensor arguments are read/write or
// read-only
enum TensorArgType { ReadWrite, ReadOnly };
template <typename IndexType>
__device__ __forceinline__ IndexType getLinearBlockId() {
return blockIdx.z * gridDim.y * gridDim.x +
blockIdx.y * gridDim.x +
blockIdx.x;
}
// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
// return the reduced value
template <typename T, typename ReduceOp>
__device__ T reduceBlock(T* smem,
int numVals,
T threadVal,
ReduceOp reduceOp,
T init) {
if (numVals == 0) {
return init;
}
if (threadIdx.x < numVals) {
smem[threadIdx.x] = threadVal;
}
// First warp will perform reductions across warps
__syncthreads();
if ((threadIdx.x / warpSize) == 0) {
T r = threadIdx.x < numVals ? smem[threadIdx.x] : init;
for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) {
r = reduceOp(r, smem[i]);
}
smem[threadIdx.x] = r;
}
// First thread will perform reductions across the block
__syncthreads();
T r = init;
if (threadIdx.x == 0) {
r = smem[0];
int numLanesParticipating = min(numVals, warpSize);
if (numLanesParticipating == 32) {
// Unroll for warpSize == 32 and numVals >= 32
#pragma unroll
for (int i = 1; i < 32; ++i) {
r = reduceOp(r, smem[i]);
}
} else {
for (int i = 1; i < numLanesParticipating; ++i) {
r = reduceOp(r, smem[i]);
}
}
}
return r;
}
// Make sure the given tensor doesn't have too many dimensions
void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);
// Produces a grid with at least one point per tile
THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);
#endif // THC_REDUCE_APPLY_UTILS_INC