tmp

GnibChen · Jan 24, 2022 · 55ca661 · 55ca661
1 parent 76d388a
commit 55ca661
Show file tree

Hide file tree

Showing 27 changed files with 463 additions and 0 deletions.
diff --git a/08/00_hello/04/CMakeLists.txt b/08/00_hello/04/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
diff --git a/08/00_hello/04/main.cu b/08/00_hello/04/main.cu
@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Device-only helper: callable from GPU code, not from the host.
+__device__ void say_hello() {
+    printf("Hello, world from GPU!\n");
+}
+
+// Host-only helper; __host__ is written out explicitly to contrast with __device__.
+__host__ void say_hello_host() {
+    printf("Hello, world from CPU!\n");
+}
+
+// Kernel entry point: launched from the host, runs on the GPU.
+__global__ void kernel() {
+    say_hello();
+}
+
+// Launches the kernel on one block of one thread, waits for it, then greets from the CPU.
+// Returns 0 on success, or 1 (skipping the CPU greeting) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    say_hello_host();
+    return 0;
+}
diff --git a/08/00_hello/05/CMakeLists.txt b/08/00_hello/05/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
diff --git a/08/00_hello/05/main.cu b/08/00_hello/05/main.cu
@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Device-only helper: callable from GPU code, not from the host.
+__device__ void say_hello() {
+    printf("Hello, world from GPU!\n");
+}
+
+// Host helper: a function without any execution-space qualifier is host-only.
+void say_hello_host() {
+    printf("Hello, world from CPU!\n");
+}
+
+// Kernel entry point: launched from the host, runs on the GPU.
+__global__ void kernel() {
+    say_hello();
+}
+
+// Launches the kernel on one block of one thread, waits for it, then greets from the CPU.
+// Returns 0 on success, or 1 (skipping the CPU greeting) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    say_hello_host();
+    return 0;
+}
diff --git a/08/00_hello/06/CMakeLists.txt b/08/00_hello/06/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
diff --git a/08/00_hello/06/main.cu b/08/00_hello/06/main.cu
@@ -0,0 +1,30 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Compiled for both host and device, so the kernel and main() can both call it.
+__host__ __device__ void say_hello() {
+    printf("Hello, world!\n");
+}
+
+// Kernel entry point: launched from the host, runs on the GPU.
+__global__ void kernel() {
+    say_hello();
+}
+
+// Launches the kernel on one block of one thread, waits for it, then calls say_hello() on the CPU.
+// Returns 0 on success, or 1 (skipping the CPU greeting) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    say_hello();
+    return 0;
+}
diff --git a/08/00_hello/07/CMakeLists.txt b/08/00_hello/07/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
+# Allow device code to call constexpr functions that have no __device__ qualifier.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
diff --git a/08/00_hello/07/main.cu b/08/00_hello/07/main.cu
@@ -0,0 +1,32 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Returns the string that starts one character after p, i.e. drops the first character.
+// It has no __device__ qualifier; the build passes --expt-relaxed-constexpr so
+// that device code may still call it.
+constexpr const char *cuthead(const char *p) {
+    return p + 1;
+}
+
+// Kernel: prints its literal with the leading character removed.
+__global__ void kernel() {
+    printf(cuthead("Gello, world!\n"));
+}
+
+// Launches the kernel on one block of one thread, waits for it, then prints a trimmed literal on the CPU.
+// Returns 0 on success, or 1 (skipping the CPU output) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    printf(cuthead("Cello, world!\n"));
+    return 0;
+}
diff --git a/08/00_hello/08/CMakeLists.txt b/08/00_hello/08/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
diff --git a/08/00_hello/08/main.cu b/08/00_hello/08/main.cu
@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Compiled for both host and device. __CUDA_ARCH__ is defined only while device
+// code is being compiled, so each version prints its own message.
+__host__ __device__ void say_hello() {
+#ifdef __CUDA_ARCH__
+    printf("Hello, world from GPU!\n");
+#else
+    printf("Hello, world from CPU!\n");
+#endif
+}
+
+// Kernel entry point: launched from the host, runs on the GPU.
+__global__ void kernel() {
+    say_hello();
+}
+
+// Launches the kernel on one block of one thread, waits for it, then calls say_hello() on the CPU.
+// Returns 0 on success, or 1 (skipping the CPU greeting) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    say_hello();
+    return 0;
+}
diff --git a/08/00_hello/09/CMakeLists.txt b/08/00_hello/09/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
diff --git a/08/00_hello/09/main.cu b/08/00_hello/09/main.cu
@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Compiled for both host and device. __CUDA_ARCH__ is defined only while device
+// code is being compiled; the device version prints its value.
+__host__ __device__ void say_hello() {
+#ifdef __CUDA_ARCH__
+    printf("Hello, world from GPU architecture %d!\n", __CUDA_ARCH__);
+#else
+    printf("Hello, world from CPU!\n");
+#endif
+}
+
+// Kernel entry point: launched from the host, runs on the GPU.
+__global__ void kernel() {
+    say_hello();
+}
+
+// Launches the kernel on one block of one thread, waits for it, then calls say_hello() on the CPU.
+// Returns 0 on success, or 1 (skipping the CPU greeting) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    say_hello();
+    return 0;
+}
diff --git a/08/00_hello/10/CMakeLists.txt b/08/00_hello/10/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+# Build device code for compute capabilities 5.2, 7.0, 7.5 and 8.6.
+# NOTE: CMAKE_CUDA_ARCHITECTURES was introduced in CMake 3.18; older versions ignore it.
+set(CMAKE_CUDA_ARCHITECTURES 52;70;75;86)
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
diff --git a/08/00_hello/10/main.cu b/08/00_hello/10/main.cu
@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Compiled for both host and device. __CUDA_ARCH__ is defined only while device
+// code is being compiled; the device version prints its value.
+__host__ __device__ void say_hello() {
+#ifdef __CUDA_ARCH__
+    printf("Hello, world from GPU architecture %d!\n", __CUDA_ARCH__);
+#else
+    printf("Hello, world from CPU!\n");
+#endif
+}
+
+// Kernel entry point: launched from the host, runs on the GPU.
+__global__ void kernel() {
+    say_hello();
+}
+
+// Launches the kernel on one block of one thread, waits for it, then calls say_hello() on the CPU.
+// Returns 0 on success, or 1 (skipping the CPU greeting) if the launch or the kernel failed.
+int main() {
+    kernel<<<1, 1>>>();
+    // The launch is asynchronous: configuration errors are reported by
+    // cudaGetLastError(), execution errors by the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    say_hello();
+    return 0;
+}
diff --git a/08/05_math/01/CMakeLists.txt b/08/05_math/01/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
+target_include_directories(main PUBLIC ../../include)
+# Allow __device__ lambdas to be written in host code and passed to kernels.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+# Allow device code to call constexpr functions that have no __device__ qualifier.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
diff --git a/08/05_math/01/main.cu b/08/05_math/01/main.cu
@@ -0,0 +1,39 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+#include <vector>
+#include "CudaAllocator.h"
+
+// Calls func(i) for every i in [0, n). Each thread starts at its global index
+// and advances by the total number of launched threads, so the whole range is
+// covered whatever grid size is used.
+template <class Func>
+__global__ void parallel_for(int n, Func func) {
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
+         i < n; i += blockDim.x * gridDim.x) {
+        func(i);
+    }
+}
+
+// Fills an array with sinf(i) on the GPU, then prints, for every element, the
+// difference from sinf(i) evaluated on the host.
+int main() {
+    int n = 65536;
+    // NOTE(review): CudaAllocator comes from a project header not shown here;
+    // presumably it returns memory accessible from both host and device - confirm.
+    std::vector<float, CudaAllocator<float>> arr(n);
+
+    parallel_for<<<32, 128>>>(n, [arr = arr.data()] __device__ (int i) {
+        arr[i] = sinf(i);
+    });
+    // Launch-configuration errors are only reported through cudaGetLastError().
+    checkCudaErrors(cudaGetLastError());
+
+    // Wait for the kernel to finish before the host reads the results.
+    checkCudaErrors(cudaDeviceSynchronize());
+    for (int i = 0; i < n; i++) {
+        printf("diff %d = %f\n", i, arr[i] - sinf(i));
+    }
+
+    return 0;
+}
diff --git a/08/05_math/02/CMakeLists.txt b/08/05_math/02/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
+target_include_directories(main PUBLIC ../../include)
+# Allow __device__ lambdas to be written in host code and passed to kernels.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+# Allow device code to call constexpr functions that have no __device__ qualifier.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
diff --git a/08/05_math/02/main.cu b/08/05_math/02/main.cu
@@ -0,0 +1,48 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+#include <vector>
+#include "CudaAllocator.h"
+#include "ticktock.h"
+
+// Calls func(i) for every i in [0, n). Each thread starts at its global index
+// and advances by the total number of launched threads, so the whole range is
+// covered whatever grid size is used.
+template <class Func>
+__global__ void parallel_for(int n, Func func) {
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
+         i < n; i += blockDim.x * gridDim.x) {
+        func(i);
+    }
+}
+
+// Evaluates sinf(i) for 2^25 indices, first on the CPU and then on the GPU.
+// NOTE(review): TICK/TOCK come from ticktock.h (not shown); presumably they time the enclosed region.
+int main() {
+    int n = 1<<25;
+    std::vector<float, CudaAllocator<float>> gpu(n);
+    std::vector<float> cpu(n);
+
+    TICK(cpu_sinf);
+    for (int i = 0; i < n; i++) {
+        cpu[i] = sinf(i);
+    }
+    TOCK(cpu_sinf);
+
+    TICK(gpu_sinf);
+    // n / 512 blocks of 128 threads: the loop in parallel_for makes each thread handle 4 indices.
+    parallel_for<<<n / 512, 128>>>(n, [gpu = gpu.data()] __device__ (int i) {
+        gpu[i] = sinf(i);
+    });
+    // Launch-configuration errors are only reported through cudaGetLastError().
+    checkCudaErrors(cudaGetLastError());
+    // The launch is asynchronous; wait for the kernel to finish before TOCK.
+    checkCudaErrors(cudaDeviceSynchronize());
+    TOCK(gpu_sinf);
+
+    //for (int i = 0; i < n; i++) {
+        //printf("diff %d = %f\n", i, gpu[i] - cpu[i]);
+    //}
+
+    return 0;
+}
diff --git a/08/05_math/03/CMakeLists.txt b/08/05_math/03/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
+target_include_directories(main PUBLIC ../../include)
+# Allow __device__ lambdas to be written in host code and passed to kernels.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+# Allow device code to call constexpr functions that have no __device__ qualifier.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
diff --git a/08/05_math/03/main.cu b/08/05_math/03/main.cu
@@ -0,0 +1,50 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+#include <vector>
+#include "CudaAllocator.h"
+#include "ticktock.h"
+
+// Calls func(i) for every i in [0, n). Each thread starts at its global index
+// and advances by the total number of launched threads, so the whole range is
+// covered whatever grid size is used.
+template <class Func>
+__global__ void parallel_for(int n, Func func) {
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
+         i < n; i += blockDim.x * gridDim.x) {
+        func(i);
+    }
+}
+
+// Evaluates the sine of 2^25 indices: sinf on the CPU, the __sinf intrinsic on the GPU.
+// NOTE(review): TICK/TOCK come from ticktock.h (not shown); presumably they time the enclosed region.
+int main() {
+    int n = 1<<25;
+    std::vector<float, CudaAllocator<float>> gpu(n);
+    std::vector<float> cpu(n);
+
+    TICK(cpu_sinf);
+    for (int i = 0; i < n; i++) {
+        cpu[i] = sinf(i);
+    }
+    TOCK(cpu_sinf);
+
+    TICK(gpu_sinf);
+    // n / 512 blocks of 128 threads: the loop in parallel_for makes each thread handle 4 indices.
+    parallel_for<<<n / 512, 128>>>(n, [gpu = gpu.data()] __device__ (int i) {
+        // __sinf is the fast device intrinsic: quicker than sinf but less
+        // accurate, especially for large arguments such as these indices.
+        gpu[i] = __sinf(i);
+    });
+    // Launch-configuration errors are only reported through cudaGetLastError().
+    checkCudaErrors(cudaGetLastError());
+    // The launch is asynchronous; wait for the kernel to finish before TOCK.
+    checkCudaErrors(cudaDeviceSynchronize());
+    TOCK(gpu_sinf);
+
+    //for (int i = 0; i < n; i++) {
+        //printf("diff %d = %f\n", i, gpu[i] - cpu[i]);
+    //}
+
+    return 0;
+}
diff --git a/08/05_math/04/CMakeLists.txt b/08/05_math/04/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(CMAKE_CXX_STANDARD 17)
+# Default to an optimized build, but keep a build type chosen by the user
+# (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of unconditionally overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+project(hellocuda LANGUAGES CXX CUDA)
+
+add_executable(main main.cu)
+target_include_directories(main PUBLIC ../../include)
+# Allow __device__ lambdas to be written in host code and passed to kernels.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+# Allow device code to call constexpr functions that have no __device__ qualifier.
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)