gpu_reduction.cu
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>
#include "get_time.h"
// Tree reduction within a warp: each step folds the upper half of the
// active lanes onto the lower half, so lane 0 ends up with the warp sum.
__inline__ __device__ int warpReduceSum(int val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xFFFFFFFF, val, offset);  // mask is the first argument
  return val;
}
// Grid-stride reduction: each thread sums its strided slice of b, then
// folds its partial sum into *a with a single atomicAdd per thread.
__global__ void cpy(int *a, int *b, int n) {
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  int sum = 0;
  while (i < n) {
    sum += b[i];
    i += blockDim.x * gridDim.x;
  }
  atomicAdd(a, sum);
}
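
// Sketch (not part of the original file): warpReduceSum above is never
// called. A hypothetical variant of cpy could use it so that only one
// atomicAdd per warp hits global memory instead of one per thread:
__global__ void cpy_warp(int *a, int *b, int n) {
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  int sum = 0;
  while (i < n) {
    sum += b[i];
    i += blockDim.x * gridDim.x;
  }
  sum = warpReduceSum(sum);         // fold the 32 lane sums in-register
  if (threadIdx.x % warpSize == 0)  // lane 0 publishes the warp total
    atomicAdd(a, sum);
}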
int main() {
  int n = 1024 * 1024 * 1024 / 4;  // 256M ints, i.e. 1 GiB per buffer
  int *a, *b;
  cudaMalloc(&a, n * sizeof(int));
  cudaMalloc(&b, n * sizeof(int));
  int repeat = 25;
  double t;

  // Baseline: device-to-device memcpy. Each copy reads and writes 1 GiB,
  // so the effective traffic per iteration is n * 8 bytes = 2 GiB.
  t = get_time();
  for (int i = 0; i < repeat; i++) {
    cudaMemcpyAsync(a, b, n * sizeof(int), cudaMemcpyDeviceToDevice, 0);
  }
  cudaDeviceSynchronize();
  t = (get_time() - t) / repeat;
  printf("cudaMemcpyAsync 1 GB data (2 GB traffic), bw %.3f GB/s\n",
         n * 8.0 / t / (1024 * 1024 * 1024.0f));
  // Sweep block sizes with a fixed grid of 896 blocks, presumably chosen
  // to saturate the SMs of the benchmark GPU.
  for (auto bs : {32, 64, 128, 256, 512, 1024}) {
    // Warm up so one-time launch and caching effects stay out of the timing.
    for (int i = 0; i < 10; i++) {
      cpy<<<896, bs>>>(a, b, n);
    }
    cudaDeviceSynchronize();
    t = get_time();
    for (int i = 0; i < repeat; i++) {
      cpy<<<896, bs>>>(a, b, n);
    }
    cudaDeviceSynchronize();
    t = (get_time() - t) / repeat;
    // The kernel only reads b, so traffic per launch is n * 4 bytes = 1 GiB.
    printf("reducing 1 GB data, block_dim %d, %.2f ms, bw %.3f GB/s\n", bs,
           t * 1000, n * 4.0 / t / (1024 * 1024 * 1024.0f));
  }
}
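
To build and run, something like the following should work, assuming nvcc
is on the PATH and get_time.h sits next to the source (the -arch value is a
placeholder; substitute your GPU's compute capability):

nvcc -O2 -arch=sm_70 gpu_reduction.cu -o gpu_reduction
./gpu_reduction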