forked from parallel101/course
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
356 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
cmake_minimum_required(VERSION 3.10) | ||
|
||
set(CMAKE_CXX_STANDARD 17) | ||
set(CMAKE_BUILD_TYPE Release) | ||
|
||
project(hellocuda LANGUAGES CXX CUDA) | ||
|
||
add_executable(main main.cu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#include <cstdio> | ||
#include <cuda_runtime.h> | ||
|
||
__global__ void kernel() { | ||
printf("Block (%d,%d) of (%d,%d), Thread (%d,%d) of (%d,%d)\n", | ||
blockIdx.x, blockIdx.y, | ||
gridDim.x, gridDim.y, | ||
threadIdx.x, threadIdx.y, | ||
blockDim.x, blockDim.y); | ||
} | ||
|
||
int main() { | ||
kernel<<<dim3(2, 1, 1), dim3(3, 2, 1)>>>(); | ||
cudaDeviceSynchronize(); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
cmake_minimum_required(VERSION 3.10) | ||
|
||
set(CMAKE_CXX_STANDARD 17) | ||
set(CMAKE_BUILD_TYPE Release) | ||
|
||
project(hellocuda LANGUAGES CXX CUDA) | ||
|
||
add_executable(main main.cu) | ||
target_include_directories(main PUBLIC ../../include) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#include <cstdio> | ||
#include <cuda_runtime.h> | ||
#include "helper_cuda.h" | ||
#include <vector> | ||
#include "CudaAllocator.h" | ||
#include "ticktock.h" | ||
|
||
template <int blockSize, class T> | ||
__global__ void parallel_sum_kernel(T *sum, T const *arr, int n) { | ||
__shared__ volatile int local_sum[blockSize]; | ||
int j = threadIdx.x; | ||
int i = blockIdx.x; | ||
T temp_sum = 0; | ||
for (int t = i * blockSize + j; t < n; t += blockSize * gridDim.x) { | ||
temp_sum += arr[t]; | ||
} | ||
local_sum[j] = temp_sum; | ||
__syncthreads(); | ||
if constexpr (blockSize >= 1024) { | ||
if (j < 512) | ||
local_sum[j] += local_sum[j + 512]; | ||
__syncthreads(); | ||
} | ||
if constexpr (blockSize >= 512) { | ||
if (j < 256) | ||
local_sum[j] += local_sum[j + 256]; | ||
__syncthreads(); | ||
} | ||
if constexpr (blockSize >= 256) { | ||
if (j < 128) | ||
local_sum[j] += local_sum[j + 128]; | ||
__syncthreads(); | ||
} | ||
if constexpr (blockSize >= 128) { | ||
if (j < 64) | ||
local_sum[j] += local_sum[j + 64]; | ||
__syncthreads(); | ||
} | ||
if (j < 32) { | ||
if constexpr (blockSize >= 64) | ||
local_sum[j] += local_sum[j + 32]; | ||
if constexpr (blockSize >= 32) | ||
local_sum[j] += local_sum[j + 16]; | ||
if constexpr (blockSize >= 16) | ||
local_sum[j] += local_sum[j + 8]; | ||
if constexpr (blockSize >= 8) | ||
local_sum[j] += local_sum[j + 4]; | ||
if constexpr (blockSize >= 4) | ||
local_sum[j] += local_sum[j + 2]; | ||
if (j == 0) { | ||
sum[i] = local_sum[0] + local_sum[1]; | ||
} | ||
} | ||
} | ||
|
||
template <int reduceScale = 4096, int blockSize = 256, int cutoffSize = reduceScale * 2, class T> | ||
int parallel_sum(T const *arr, int n) { | ||
if (n > cutoffSize) { | ||
std::vector<int, CudaAllocator<int>> sum(n / reduceScale); | ||
parallel_sum_kernel<blockSize><<<n / reduceScale, blockSize>>>(sum.data(), arr, n); | ||
return parallel_sum(sum.data(), n / reduceScale); | ||
} else { | ||
checkCudaErrors(cudaDeviceSynchronize()); | ||
T final_sum = 0; | ||
for (int i = 0; i < n; i++) { | ||
final_sum += arr[i]; | ||
} | ||
return final_sum; | ||
} | ||
} | ||
|
||
int main() { | ||
int n = 1<<24; | ||
std::vector<int, CudaAllocator<int>> arr(n); | ||
std::vector<int, CudaAllocator<int>> sum(n / 4096); | ||
|
||
for (int i = 0; i < n; i++) { | ||
arr[i] = std::rand() % 4; | ||
} | ||
|
||
TICK(parallel_sum); | ||
int final_sum = parallel_sum(arr.data(), n); | ||
TOCK(parallel_sum); | ||
|
||
printf("result: %d\n", final_sum); | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
cmake_minimum_required(VERSION 3.10) | ||
|
||
set(CMAKE_CXX_STANDARD 17) | ||
set(CMAKE_BUILD_TYPE Release) | ||
|
||
project(hellocuda LANGUAGES CXX CUDA) | ||
|
||
add_executable(main main.cu) | ||
target_include_directories(main PUBLIC ../../include) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#include <cstdio> | ||
#include <cuda_runtime.h> | ||
#include "helper_cuda.h" | ||
#include <vector> | ||
#include "CudaAllocator.h" | ||
#include "ticktock.h" | ||
|
||
template <class T> | ||
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) { | ||
int linearized = blockIdx.x * blockDim.x + threadIdx.x; | ||
int y = linearized / nx; | ||
int x = linearized % nx; | ||
if (x >= nx || y >= ny) return; | ||
out[y * nx + x] = in[x * nx + y]; | ||
} | ||
|
||
int main() { | ||
int nx = 1<<14, ny = 1<<14; | ||
std::vector<int, CudaAllocator<int>> in(nx * ny); | ||
std::vector<int, CudaAllocator<int>> out(nx * ny); | ||
|
||
for (int i = 0; i < nx * ny; i++) { | ||
in[i] = i; | ||
} | ||
|
||
TICK(parallel_transpose); | ||
parallel_transpose<<<nx * ny / 1024, 1024>>> | ||
(out.data(), in.data(), nx, ny); | ||
checkCudaErrors(cudaDeviceSynchronize()); | ||
TOCK(parallel_transpose); | ||
|
||
for (int y = 0; y < ny; y++) { | ||
for (int x = 0; x < nx; x++) { | ||
if (out[y * nx + x] != in[x * nx + y]) { | ||
printf("Wrong At x=%d,y=%d: %d != %d\n", x, y, | ||
out[y * nx + x], in[x * nx + y]); | ||
return -1; | ||
} | ||
} | ||
} | ||
|
||
printf("All Correct!\n"); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
cmake_minimum_required(VERSION 3.10) | ||
|
||
set(CMAKE_CXX_STANDARD 17) | ||
set(CMAKE_BUILD_TYPE Release) | ||
|
||
project(hellocuda LANGUAGES CXX CUDA) | ||
|
||
add_executable(main main.cu) | ||
target_include_directories(main PUBLIC ../../include) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#include <cstdio> | ||
#include <cuda_runtime.h> | ||
#include "helper_cuda.h" | ||
#include <vector> | ||
#include "CudaAllocator.h" | ||
#include "ticktock.h" | ||
|
||
template <class T> | ||
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) { | ||
int x = blockIdx.x * blockDim.x + threadIdx.x; | ||
int y = blockIdx.y * blockDim.y + threadIdx.y; | ||
if (x >= nx || y >= ny) return; | ||
out[y * nx + x] = in[x * nx + y]; | ||
} | ||
|
||
int main() { | ||
int nx = 1<<14, ny = 1<<14; | ||
std::vector<int, CudaAllocator<int>> in(nx * ny); | ||
std::vector<int, CudaAllocator<int>> out(nx * ny); | ||
|
||
for (int i = 0; i < nx * ny; i++) { | ||
in[i] = i; | ||
} | ||
|
||
TICK(parallel_transpose); | ||
parallel_transpose<<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>> | ||
(out.data(), in.data(), nx, ny); | ||
checkCudaErrors(cudaDeviceSynchronize()); | ||
TOCK(parallel_transpose); | ||
|
||
for (int y = 0; y < ny; y++) { | ||
for (int x = 0; x < nx; x++) { | ||
if (out[y * nx + x] != in[x * nx + y]) { | ||
printf("Wrong At x=%d,y=%d: %d != %d\n", x, y, | ||
out[y * nx + x], in[x * nx + y]); | ||
return -1; | ||
} | ||
} | ||
} | ||
|
||
printf("All Correct!\n"); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
cmake_minimum_required(VERSION 3.10) | ||
|
||
set(CMAKE_CXX_STANDARD 17) | ||
set(CMAKE_BUILD_TYPE Release) | ||
|
||
project(hellocuda LANGUAGES CXX CUDA) | ||
|
||
add_executable(main main.cu) | ||
target_include_directories(main PUBLIC ../../include) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#include <cstdio> | ||
#include <cuda_runtime.h> | ||
#include "helper_cuda.h" | ||
#include <vector> | ||
#include "CudaAllocator.h" | ||
#include "ticktock.h" | ||
|
||
template <int blockSize, class T> | ||
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) { | ||
int x = blockIdx.x * blockSize + threadIdx.x; | ||
int y = blockIdx.y * blockSize + threadIdx.y; | ||
if (x >= nx || y >= ny) return; | ||
__shared__ T tmp[blockSize * blockSize]; | ||
int rx = blockIdx.y * blockSize + threadIdx.x; | ||
int ry = blockIdx.x * blockSize + threadIdx.y; | ||
tmp[threadIdx.y * blockSize + threadIdx.x] = in[ry * nx + rx]; | ||
__syncthreads(); | ||
out[y * nx + x] = tmp[threadIdx.x * blockSize + threadIdx.y]; | ||
} | ||
|
||
int main() { | ||
int nx = 1<<14, ny = 1<<14; | ||
std::vector<int, CudaAllocator<int>> in(nx * ny); | ||
std::vector<int, CudaAllocator<int>> out(nx * ny); | ||
|
||
for (int i = 0; i < nx * ny; i++) { | ||
in[i] = i; | ||
} | ||
|
||
TICK(parallel_transpose); | ||
parallel_transpose<32><<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>> | ||
(out.data(), in.data(), nx, ny); | ||
checkCudaErrors(cudaDeviceSynchronize()); | ||
TOCK(parallel_transpose); | ||
|
||
for (int y = 0; y < ny; y++) { | ||
for (int x = 0; x < nx; x++) { | ||
if (out[y * nx + x] != in[x * nx + y]) { | ||
printf("Wrong At x=%d,y=%d: %d != %d\n", x, y, | ||
out[y * nx + x], in[x * nx + y]); | ||
return -1; | ||
} | ||
} | ||
} | ||
|
||
printf("All Correct!\n"); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
cmake_minimum_required(VERSION 3.10) | ||
|
||
set(CMAKE_CXX_STANDARD 17) | ||
set(CMAKE_BUILD_TYPE Release) | ||
|
||
project(hellocuda LANGUAGES CXX CUDA) | ||
|
||
add_executable(main main.cu) | ||
target_include_directories(main PUBLIC ../../include) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>) | ||
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#include <cstdio> | ||
#include <cuda_runtime.h> | ||
#include "helper_cuda.h" | ||
#include <vector> | ||
#include "CudaAllocator.h" | ||
#include "ticktock.h" | ||
|
||
template <int blockSize, class T> | ||
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) { | ||
int x = blockIdx.x * blockSize + threadIdx.x; | ||
int y = blockIdx.y * blockSize + threadIdx.y; | ||
if (x >= nx || y >= ny) return; | ||
__shared__ T tmp[(blockSize + 1) * blockSize]; | ||
int rx = blockIdx.y * blockSize + threadIdx.x; | ||
int ry = blockIdx.x * blockSize + threadIdx.y; | ||
tmp[threadIdx.y * (blockSize + 1) + threadIdx.x] = in[ry * nx + rx]; | ||
__syncthreads(); | ||
out[y * nx + x] = tmp[threadIdx.x * (blockSize + 1) + threadIdx.y]; | ||
} | ||
|
||
int main() { | ||
int nx = 1<<14, ny = 1<<14; | ||
std::vector<int, CudaAllocator<int>> in(nx * ny); | ||
std::vector<int, CudaAllocator<int>> out(nx * ny); | ||
|
||
for (int i = 0; i < nx * ny; i++) { | ||
in[i] = i; | ||
} | ||
|
||
TICK(parallel_transpose); | ||
parallel_transpose<32><<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>> | ||
(out.data(), in.data(), nx, ny); | ||
checkCudaErrors(cudaDeviceSynchronize()); | ||
TOCK(parallel_transpose); | ||
|
||
for (int y = 0; y < ny; y++) { | ||
for (int x = 0; x < nx; x++) { | ||
if (out[y * nx + x] != in[x * nx + y]) { | ||
printf("Wrong At x=%d,y=%d: %d != %d\n", x, y, | ||
out[y * nx + x], in[x * nx + y]); | ||
return -1; | ||
} | ||
} | ||
} | ||
|
||
printf("All Correct!\n"); | ||
return 0; | ||
} |
Binary file not shown.