forked from CalvinXKY/BasicCUDA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzeroCopy.cu
executable file
·219 lines (184 loc) · 7.73 KB
/
zeroCopy.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
 * Zero-copy memory demonstrated with a vectorAdd example.
*
* This demo code might be stale with the development of CUDA.
* To use the latest API operations, you could see NVIDIA guide:
* https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
* https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
*
* Author: kevin.xie
* Email: [email protected]
*/
#include "memoryOpt.h"
#include "timer.h"
/*
 * Element-wise vector addition: C[i] = A[i] + B[i].
 *
 * Expects a 1D grid of 1D blocks providing at least numElements threads;
 * surplus threads exit through the bounds guard.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, const int numElements)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= numElements) {
        return;
    }
    C[idx] = A[idx] + B[idx] + 0.0f;
}
/*
 * Baseline path: vectorAdd with pageable host memory and explicit
 * cudaMemcpy transfers through device global memory.
 *
 * numElements: number of float elements per vector.
 * iterNum:     number of timed (copy-in, kernel, copy-out) iterations.
 * Returns the averaged throughput in GB/s (decimal, 1 GB = 1e9 bytes),
 * counting one vector's worth of bytes per iteration.
 * Exits the process on allocation failure or result mismatch.
 */
float vectorAddViaGlobalMemory(const unsigned int numElements, const unsigned int iterNum)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    sdkCreateTimer(&timer);

    // Cast to size_t before multiplying: unsigned int * sizeof(float) can
    // overflow 32 bits for large element counts.
    size_t memSize = (size_t)numElements * sizeof(float);

    // 1D launch configuration; ceil-div so the tail elements are covered.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    // Pageable host buffers on purpose -- this is the non-pinned baseline.
    float *h_A = (float *)malloc(memSize);
    float *h_B = (float *)malloc(memSize);
    float *h_C = (float *)malloc(memSize);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors with values in [0, 1].
    for (unsigned int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Device buffers, allocated once and reused across all iterations.
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_A, memSize));
    checkCudaErrors(cudaMalloc((void **)&d_B, memSize));
    checkCudaErrors(cudaMalloc((void **)&d_C, memSize));

    for (unsigned int i = 0; i < iterNum; i++) {
        sdkStartTimer(&timer);
        checkCudaErrors(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice));
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
        checkCudaErrors(cudaGetLastError());
        // The blocking device-to-host copy also synchronizes with the kernel,
        // so the timer covers the full copy-in/compute/copy-out round trip.
        checkCudaErrors(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost));
        sdkStopTimer(&timer);
        elapsedTimeInMs += sdkGetTimerValue(&timer);
        sdkResetTimer(&timer);
    }

    // Verify the device result against a host-side reference.
    for (unsigned int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %u!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Throughput in GB/s (decimal units). Accumulate in double: a float
    // product of memSize * iterNum loses precision for large byte counts.
    double time_s = elapsedTimeInMs / 1e3;
    float throughputInGBs = (float)(((double)memSize * iterNum) / 1e9 / time_s);

    sdkDeleteTimer(&timer);

    // Free device global memory, then host memory.
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    return throughputInGBs;
}
/*
 * Zero-copy path: the kernel reads/writes pinned, mapped host memory
 * directly through device pointers -- no explicit cudaMemcpy at all.
 *
 * numElements: number of float elements per vector.
 * iterNum:     number of timed kernel iterations.
 * Returns the averaged throughput in GB/s (decimal, 1 GB = 1e9 bytes),
 * counting one vector's worth of bytes per iteration.
 * Exits the process on allocation failure or result mismatch.
 */
float vectorAddViaZeroCopy(const unsigned int numElements, const unsigned int iterNum)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    sdkCreateTimer(&timer);

    // Cast to size_t before multiplying: unsigned int * sizeof(float) can
    // overflow 32 bits for large element counts.
    size_t memSize = (size_t)numElements * sizeof(float);

    // 1D launch configuration; ceil-div so the tail elements are covered.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    // NOTE(review): cudaSetDeviceFlags is documented to apply before the
    // context is initialized; by the time this runs the baseline test has
    // already touched the device -- confirm the flag still takes effect here.
    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));

    // Pinned + mapped host buffers (Policy1, cudaMallocHost, kept for reference):
    // checkCudaErrors(cudaMallocHost((void **)&h_A, memSize));
    // checkCudaErrors(cudaMallocHost((void **)&h_B, memSize));
    // checkCudaErrors(cudaMallocHost((void **)&h_C, memSize));
    float *h_A = NULL;
    float *h_B = NULL;
    float *h_C = NULL;
    float *map_A, *map_B, *map_C;
    checkCudaErrors(cudaHostAlloc((void **)&h_A, memSize, cudaHostAllocMapped));
    checkCudaErrors(cudaHostAlloc((void **)&h_B, memSize, cudaHostAllocMapped));
    checkCudaErrors(cudaHostAlloc((void **)&h_C, memSize, cudaHostAllocMapped));
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Device-side aliases of the pinned host buffers.
    checkCudaErrors(cudaHostGetDevicePointer(&map_A, h_A, 0));
    checkCudaErrors(cudaHostGetDevicePointer(&map_B, h_B, 0));
    checkCudaErrors(cudaHostGetDevicePointer(&map_C, h_C, 0));

    // Initialize the host input vectors with values in [0, 1].
    for (unsigned int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    for (unsigned int i = 0; i < iterNum; i++) {
        sdkStartTimer(&timer);
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(map_A, map_B, map_C, numElements);
        checkCudaErrors(cudaGetLastError());
        // Kernel launches are asynchronous: without this synchronization the
        // timer would capture only the launch overhead, not the kernel's
        // execution over mapped memory, grossly inflating the throughput.
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
        elapsedTimeInMs += sdkGetTimerValue(&timer);
        sdkResetTimer(&timer);
    }

    // Verify the device result against a host-side reference.
    for (unsigned int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %u!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Throughput in GB/s (decimal units). Accumulate in double: a float
    // product of memSize * iterNum loses precision for large byte counts.
    double time_s = elapsedTimeInMs / 1e3;
    float throughputInGBs = (float)(((double)memSize * iterNum) / 1e9 / time_s);

    sdkDeleteTimer(&timer);

    // Pinned allocations must be released with cudaFreeHost, not free().
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    return throughputInGBs;
}
/*
 * Entry point. Parses -device/-size/-iter flags, verifies the selected GPU
 * can map host memory, then runs and reports both transfer strategies.
 */
int main(int argc, char **argv)
{
    printf("[Zero Copy Opt Vector Add] - Starting...\n");
    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        // -size is an element count, not a byte count.
        printf("      -size=n  Number of float elements per vector. Default: 5000000\n");
        printf("      -iter=n  Iteration numbers of trans. Default:1 \n");
        exit(EXIT_SUCCESS);
    }

    // Defaults; each flag overrides its value when present.
    unsigned int numElements = 5000000;
    unsigned int iterNumbers = 1;
    unsigned int gpuID = 0;
    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "size")) {
        numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "iter")) {
        iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter");
    }

    checkCudaErrors(cudaSetDevice(gpuID));

    // Zero-copy requires a device that can map host memory; fail loudly
    // instead of exiting silently.
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, gpuID));
    if (!prop.canMapHostMemory) {
        fprintf(stderr, "Device %u cannot map host memory; zero-copy test aborted.\n", gpuID);
        exit(EXIT_FAILURE);
    }

    printf(">. Data transfer via global memory. VectorAdd throughput: %f GB/s\n",
           vectorAddViaGlobalMemory(numElements, iterNumbers));
    printf(">. Data transfer via zero copy. VectorAdd throughput: %f GB/s\n",
           vectorAddViaZeroCopy(numElements, iterNumbers));
    exit(EXIT_SUCCESS);
}