Commit d4fcdfd
archibate committed Jan 27, 2022
1 parent f41a092 commit d4fcdfd
Showing 14 changed files with 356 additions and 4 deletions.
10 changes: 6 additions & 4 deletions 08/01_thread/07/main.cu
@@ -2,13 +2,15 @@
#include <cuda_runtime.h>

__global__ void kernel() {
-    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    unsigned int tnum = blockDim.x * gridDim.x;
-    printf("Flattened Thread %d of %d\n", tid, tnum);
+    printf("Block (%d,%d,%d) of (%d,%d,%d), Thread (%d,%d,%d) of (%d,%d,%d)\n",
+           blockIdx.x, blockIdx.y, blockIdx.z,
+           gridDim.x, gridDim.y, gridDim.z,
+           threadIdx.x, threadIdx.y, threadIdx.z,
+           blockDim.x, blockDim.y, blockDim.z);
}

int main() {
-    kernel<<<2, 3>>>();
+    kernel<<<dim3(2, 1, 1), dim3(2, 2, 2)>>>();
    cudaDeviceSynchronize();
    return 0;
}
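
A hedged aside on the new 3D launch above (not part of this commit): generalizing the removed tid/tnum lines to a dim3 grid and block, a single flattened global thread index could be computed as below; flat_tid is a hypothetical helper name used only for illustration.

    // Sketch: flatten a 3D grid/block launch into one linear thread id.
    __device__ unsigned int flat_tid() {
        unsigned int tid_in_block = threadIdx.z * blockDim.y * blockDim.x
                                  + threadIdx.y * blockDim.x + threadIdx.x;
        unsigned int block_id = blockIdx.z * gridDim.y * gridDim.x
                              + blockIdx.y * gridDim.x + blockIdx.x;
        unsigned int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
        return block_id * threads_per_block + tid_in_block;
    }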
8 changes: 8 additions & 0 deletions 08/01_thread/08/CMakeLists.txt
@@ -0,0 +1,8 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
16 changes: 16 additions & 0 deletions 08/01_thread/08/main.cu
@@ -0,0 +1,16 @@
#include <cstdio>
#include <cuda_runtime.h>

__global__ void kernel() {
printf("Block (%d,%d) of (%d,%d), Thread (%d,%d) of (%d,%d)\n",
blockIdx.x, blockIdx.y,
gridDim.x, gridDim.y,
threadIdx.x, threadIdx.y,
blockDim.x, blockDim.y);
}

int main() {
    kernel<<<dim3(2, 1, 1), dim3(3, 2, 1)>>>();
    cudaDeviceSynchronize();
    return 0;
}
11 changes: 11 additions & 0 deletions 08/08_block/10/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
88 changes: 88 additions & 0 deletions 08/08_block/10/main.cu
@@ -0,0 +1,88 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <int blockSize, class T>
__global__ void parallel_sum_kernel(T *sum, T const *arr, int n) {
    __shared__ volatile int local_sum[blockSize];
    int j = threadIdx.x;
    int i = blockIdx.x;
    T temp_sum = 0;
    for (int t = i * blockSize + j; t < n; t += blockSize * gridDim.x) {
        temp_sum += arr[t];
    }
    local_sum[j] = temp_sum;
    __syncthreads();
    if constexpr (blockSize >= 1024) {
        if (j < 512)
            local_sum[j] += local_sum[j + 512];
        __syncthreads();
    }
    if constexpr (blockSize >= 512) {
        if (j < 256)
            local_sum[j] += local_sum[j + 256];
        __syncthreads();
    }
    if constexpr (blockSize >= 256) {
        if (j < 128)
            local_sum[j] += local_sum[j + 128];
        __syncthreads();
    }
    if constexpr (blockSize >= 128) {
        if (j < 64)
            local_sum[j] += local_sum[j + 64];
        __syncthreads();
    }
    if (j < 32) {
        if constexpr (blockSize >= 64)
            local_sum[j] += local_sum[j + 32];
        if constexpr (blockSize >= 32)
            local_sum[j] += local_sum[j + 16];
        if constexpr (blockSize >= 16)
            local_sum[j] += local_sum[j + 8];
        if constexpr (blockSize >= 8)
            local_sum[j] += local_sum[j + 4];
        if constexpr (blockSize >= 4)
            local_sum[j] += local_sum[j + 2];
        if (j == 0) {
            sum[i] = local_sum[0] + local_sum[1];
        }
    }
}

template <int reduceScale = 4096, int blockSize = 256, int cutoffSize = reduceScale * 2, class T>
int parallel_sum(T const *arr, int n) {
    if (n > cutoffSize) {
        std::vector<int, CudaAllocator<int>> sum(n / reduceScale);
        parallel_sum_kernel<blockSize><<<n / reduceScale, blockSize>>>(sum.data(), arr, n);
        return parallel_sum(sum.data(), n / reduceScale);
    } else {
        checkCudaErrors(cudaDeviceSynchronize());
        T final_sum = 0;
        for (int i = 0; i < n; i++) {
            final_sum += arr[i];
        }
        return final_sum;
    }
}

int main() {
    int n = 1<<24;
    std::vector<int, CudaAllocator<int>> arr(n);
    std::vector<int, CudaAllocator<int>> sum(n / 4096);

    for (int i = 0; i < n; i++) {
        arr[i] = std::rand() % 4;
    }

    TICK(parallel_sum);
    int final_sum = parallel_sum(arr.data(), n);
    TOCK(parallel_sum);

    printf("result: %d\n", final_sum);

    return 0;
}
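
The tail of parallel_sum_kernel leans on implicit warp-synchronous execution over a volatile shared array; on architectures with independent thread scheduling (Volta and later) that assumption is no longer guaranteed. Below is a minimal sketch of a warp-shuffle alternative, assuming CUDA 9 or later and a fully active warp; warp_reduce_sum is a hypothetical helper name, not part of this commit.

    // Sketch: sum across the 32 lanes of a warp with __shfl_down_sync.
    template <class T>
    __device__ T warp_reduce_sum(T val) {
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset);  // full-warp mask
        return val;  // after the loop, lane 0 holds the warp-wide sum
    }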
11 changes: 11 additions & 0 deletions 08/09_ndarray/01/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
44 changes: 44 additions & 0 deletions 08/09_ndarray/01/main.cu
@@ -0,0 +1,44 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int linearized = blockIdx.x * blockDim.x + threadIdx.x;
    int y = linearized / nx;
    int x = linearized % nx;
    if (x >= nx || y >= ny) return;
    out[y * nx + x] = in[x * nx + y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<<<nx * ny / 1024, 1024>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
11 changes: 11 additions & 0 deletions 08/09_ndarray/02/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
43 changes: 43 additions & 0 deletions 08/09_ndarray/02/main.cu
@@ -0,0 +1,43 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= nx || y >= ny) return;
    out[y * nx + x] = in[x * nx + y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
11 changes: 11 additions & 0 deletions 08/09_ndarray/03/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
48 changes: 48 additions & 0 deletions 08/09_ndarray/03/main.cu
@@ -0,0 +1,48 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <int blockSize, class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int x = blockIdx.x * blockSize + threadIdx.x;
    int y = blockIdx.y * blockSize + threadIdx.y;
    if (x >= nx || y >= ny) return;
    __shared__ T tmp[blockSize * blockSize];
    int rx = blockIdx.y * blockSize + threadIdx.x;
    int ry = blockIdx.x * blockSize + threadIdx.y;
    tmp[threadIdx.y * blockSize + threadIdx.x] = in[ry * nx + rx];
    __syncthreads();
    out[y * nx + x] = tmp[threadIdx.x * blockSize + threadIdx.y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<32><<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
11 changes: 11 additions & 0 deletions 08/10_stencil/01/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
48 changes: 48 additions & 0 deletions 08/10_stencil/01/main.cu
@@ -0,0 +1,48 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

template <int blockSize, class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
    int x = blockIdx.x * blockSize + threadIdx.x;
    int y = blockIdx.y * blockSize + threadIdx.y;
    if (x >= nx || y >= ny) return;
    __shared__ T tmp[(blockSize + 1) * blockSize];
    int rx = blockIdx.y * blockSize + threadIdx.x;
    int ry = blockIdx.x * blockSize + threadIdx.y;
    tmp[threadIdx.y * (blockSize + 1) + threadIdx.x] = in[ry * nx + rx];
    __syncthreads();
    out[y * nx + x] = tmp[threadIdx.x * (blockSize + 1) + threadIdx.y];
}

int main() {
    int nx = 1<<14, ny = 1<<14;
    std::vector<int, CudaAllocator<int>> in(nx * ny);
    std::vector<int, CudaAllocator<int>> out(nx * ny);

    for (int i = 0; i < nx * ny; i++) {
        in[i] = i;
    }

    TICK(parallel_transpose);
    parallel_transpose<32><<<dim3(nx / 32, ny / 32, 1), dim3(32, 32, 1)>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaDeviceSynchronize());
    TOCK(parallel_transpose);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            if (out[y * nx + x] != in[x * nx + y]) {
                printf("Wrong At x=%d,y=%d: %d != %d\n", x, y,
                       out[y * nx + x], in[x * nx + y]);
                return -1;
            }
        }
    }

    printf("All Correct!\n");
    return 0;
}
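
A brief sketch of why this version pads the tile (assuming 32 shared-memory banks of 4-byte words, as on current NVIDIA GPUs): with a stride of blockSize = 32, the transposed read tmp[threadIdx.x * blockSize + threadIdx.y] sends all 32 threads of a warp to the same bank, while a stride of blockSize + 1 = 33 spreads them across different banks.

    // Bank math for the transposed read, index = tx * stride + ty:
    //   bank(tx, ty) = (tx * stride + ty) % 32
    //   stride = 32 -> for fixed ty, all 32 values of tx map to one bank (32-way conflict)
    //   stride = 33 -> consecutive tx map to consecutive banks (conflict-free)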
Binary file modified 08/slides.pptx
Binary file not shown.
