Commit d4fcdfd
archibate committed Jan 27, 2022
1 parent f41a092 commit d4fcdfd
Showing 14 changed files with 356 additions and 4 deletions.
10 changes: 6 additions & 4 deletions 08/01_thread/07/main.cu
@@ -2,13 +2,15 @@
#include <cuda_runtime.h>

__global__ void kernel() {
-    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    unsigned int tnum = blockDim.x * gridDim.x;
-    printf("Flattened Thread %d of %d\n", tid, tnum);
+    printf("Block (%d,%d,%d) of (%d,%d,%d), Thread (%d,%d,%d) of (%d,%d,%d)\n",
+           blockIdx.x, blockIdx.y, blockIdx.z,
+           gridDim.x, gridDim.y, gridDim.z,
+           threadIdx.x, threadIdx.y, threadIdx.z,
+           blockDim.x, blockDim.y, blockDim.z);
}

int main() {
-    kernel<<<2, 3>>>();
+    kernel<<<dim3(2, 1, 1), dim3(2, 2, 2)>>>();
    cudaDeviceSynchronize();
    return 0;
}
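
A hedged aside on the new 3D launch above (not part of this commit): generalizing the removed tid/tnum lines to a dim3 grid and block, a single flattened global thread index could be computed as below; flat_tid is a hypothetical helper name used only for illustration.

    // Sketch: flatten a 3D grid/block launch into one linear thread id.
    __device__ unsigned int flat_tid() {
        unsigned int tid_in_block = threadIdx.z * blockDim.y * blockDim.x
                                  + threadIdx.y * blockDim.x + threadIdx.x;
        unsigned int block_id = blockIdx.z * gridDim.y * gridDim.x
                              + blockIdx.y * gridDim.x + blockIdx.x;
        unsigned int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
        return block_id * threads_per_block + tid_in_block;
    }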
8 changes: 8 additions & 0 deletions 08/01_thread/08/CMakeLists.txt
@@ -0,0 +1,8 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
16 changes: 16 additions & 0 deletions 08/01_thread/08/main.cu
@@ -0,0 +1,16 @@
#include <cstdio>
#include <cuda_runtime.h>

__global__ void kernel() {
printf("Block (%d,%d) of (%d,%d), Thread (%d,%d) of (%d,%d)\n",
blockIdx.x, blockIdx.y,
gridDim.x, gridDim.y,
threadIdx.x, threadIdx.y,
blockDim.x, blockDim.y);
}

int main() {
    kernel<<<dim3(2, 1, 1), dim3(3, 2, 1)>>>();
    cudaDeviceSynchronize();
    return 0;
}
11 changes: 11 additions & 0 deletions 08/08_block/10/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
88 changes: 88 additions & 0 deletions 08/08_block/10/main.cu
@@ -0,0 +1,88 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <int blockSize, class T>
__global__ void parallel_sum_kernel(T *sum, T const *arr, int n) {
    __shared__ volatile int local_sum[blockSize];
    int j = threadIdx.x;
    int i = blockIdx.x;
    T temp_sum = 0;
    for (int t = i * blockSize + j; t < n; t += blockSize * gridDim.x) {
        temp_sum += arr[t];
    }
    local_sum[j] = temp_sum;
    __syncthreads();
    if constexpr (blockSize >= 1024) {
        if (j < 512)
            local_sum[j] += local_sum[j + 512];
        __syncthreads();
    }
    if constexpr (blockSize >= 512) {
        if (j < 256)
            local_sum[j] += local_sum[j + 256];
        __syncthreads();
    }
    if constexpr (blockSize >= 256) {
        if (j < 128)
            local_sum[j] += local_sum[j + 128];
        __syncthreads();
    }
    if constexpr (blockSize >= 128) {
        if (j < 64)
            local_sum[j] += local_sum[j + 64];
        __syncthreads();
    }
    if (j < 32) {
        if constexpr (blockSize >= 64)
            local_sum[j] += local_sum[j + 32];
        if constexpr (blockSize >= 32)
            local_sum[j] += local_sum[j + 16];
        if constexpr (blockSize >= 16)
            local_sum[j] += local_sum[j + 8];
        if constexpr (blockSize >= 8)
            local_sum[j] += local_sum[j + 4];
        if constexpr (blockSize >= 4)
            local_sum[j] += local_sum[j + 2];
        if (j == 0) {
            sum[i] = local_sum[0] + local_sum[1];
        }
    }
}

template <int reduceScale = 4096, int blockSize = 256, int cutoffSize = reduceScale * 2, class T>
int parallel_sum(T const *arr, int n) {
    if (n > cutoffSize) {
        std::vector<int, CudaAllocator<int>> sum(n / reduceScale);
        parallel_sum_kernel<blockSize><<<n / reduceScale, blockSize>>>(sum.data(), arr, n);
        return parallel_sum(sum.data(), n / reduceScale);
    } else {
        checkCudaErrors(cudaDeviceSynchronize());
        T final_sum = 0;
        for (int i = 0; i < n; i++) {
            final_sum += arr[i];
        }
        return final_sum;
    }
}

int main() {
    int n = 1<<24;
    std::vector<int, CudaAllocator<int>> arr(n);
    std::vector<int, CudaAllocator<int>> sum(n / 4096);

    for (int i = 0; i < n; i++) {
        arr[i] = std::rand() % 4;
    }

    TICK(parallel_sum);
    int final_sum = parallel_sum(arr.data(), n);
    TOCK(parallel_sum);

    printf("result: %d\n", final_sum);

    return 0;
}
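
The tail of parallel_sum_kernel leans on implicit warp-synchronous execution over a volatile shared array; on architectures with independent thread scheduling (Volta and later) that assumption is no longer guaranteed. Below is a minimal sketch of a warp-shuffle alternative, assuming CUDA 9 or later and a fully active warp; warp_reduce_sum is a hypothetical helper name, not part of this commit.

    // Sketch: sum across the 32 lanes of a warp with __shfl_down_sync.
    template <class T>
    __device__ T warp_reduce_sum(T val) {
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset);  // full-warp mask
        return val;  // after the loop, lane 0 holds the warp-wide sum
    }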
11 changes: 11 additions & 0 deletions 08/09_ndarray/01/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
44 changes: 44 additions & 0 deletions 08/09_ndarray/01/main.cu
@@ -0,0 +1,44 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int linearized = blockIdx.x * blockDim.x + threadIdx.x;
    int y = linearized / nx;
    int x = linearized % nx;
    if (x >= nx || y >= ny) return;
    out[y * nx + x] = in[x * nx + y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<<<nx * ny / 1024, 1024>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
11 changes: 11 additions & 0 deletions 08/09_ndarray/02/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
43 changes: 43 additions & 0 deletions 08/09_ndarray/02/main.cu
@@ -0,0 +1,43 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= nx || y >= ny) return;
    out[y * nx + x] = in[x * nx + y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
11 changes: 11 additions & 0 deletions 08/09_ndarray/03/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
48 changes: 48 additions & 0 deletions 08/09_ndarray/03/main.cu
@@ -0,0 +1,48 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <int blockSize, class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int x = blockIdx.x * blockSize + threadIdx.x;
    int y = blockIdx.y * blockSize + threadIdx.y;
    if (x >= nx || y >= ny) return;
    __shared__ T tmp[blockSize * blockSize];
    int rx = blockIdx.y * blockSize + threadIdx.x;
    int ry = blockIdx.x * blockSize + threadIdx.y;
    tmp[threadIdx.y * blockSize + threadIdx.x] = in[ry * nx + rx];
    __syncthreads();
    out[y * nx + x] = tmp[threadIdx.x * blockSize + threadIdx.y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<32><<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
11 changes: 11 additions & 0 deletions 08/10_stencil/01/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
48 changes: 48 additions & 0 deletions 08/10_stencil/01/main.cu
@@ -0,0 +1,48 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <int blockSize, class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int x = blockIdx.x * blockSize + threadIdx.x;
    int y = blockIdx.y * blockSize + threadIdx.y;
    if (x >= nx || y >= ny) return;
    __shared__ T tmp[(blockSize + 1) * blockSize];
    int rx = blockIdx.y * blockSize + threadIdx.x;
    int ry = blockIdx.x * blockSize + threadIdx.y;
    tmp[threadIdx.y * (blockSize + 1) + threadIdx.x] = in[ry * nx + rx];
    __syncthreads();
    out[y * nx + x] = tmp[threadIdx.x * (blockSize + 1) + threadIdx.y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<32><<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
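
A brief sketch of why this version pads the tile (assuming 32 shared-memory banks of 4-byte words, as on current NVIDIA GPUs): with a stride of blockSize = 32, the transposed read tmp[threadIdx.x * blockSize + threadIdx.y] sends all 32 threads of a warp to the same bank, while a stride of blockSize + 1 = 33 spreads them across different banks.

    // Bank math for the transposed read, index = tx * stride + ty:
    //   bank(tx, ty) = (tx * stride + ty) % 32
    //   stride = 32 -> for fixed ty, all 32 values of tx map to one bank (32-way conflict)
    //   stride = 33 -> consecutive tx map to consecutive banks (conflict-free)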
Binary file modified 08/slides.pptx
Binary file not shown.
