Skip to content

Commit

Permalink
t
Browse files Browse the repository at this point in the history
  • Loading branch information
archibate committed Jan 26, 2022
1 parent f756fcd commit 0832c0a
Show file tree
Hide file tree
Showing 10 changed files with 263 additions and 0 deletions.
56 changes: 56 additions & 0 deletions 08/07_atomic/04/a.ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-30672275
// Cuda compilation tools, release 11.5, V11.5.119
// Based on NVVM 7.0.1
//

.version 7.5
.target sm_52
.address_size 64

// .globl _Z12parallel_sumPiPKii

.visible .entry _Z12parallel_sumPiPKii(
.param .u64 _Z12parallel_sumPiPKii_param_0,
.param .u64 _Z12parallel_sumPiPKii_param_1,
.param .u32 _Z12parallel_sumPiPKii_param_2
)
{
.reg .pred %p<3>;
.reg .b32 %r<20>;
.reg .b64 %rd<7>;


ld.param.u64 %rd2, [_Z12parallel_sumPiPKii_param_0];
ld.param.u64 %rd3, [_Z12parallel_sumPiPKii_param_1];
ld.param.u32 %r9, [_Z12parallel_sumPiPKii_param_2];
mov.u32 %r11, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r17, %r1, %r11, %r12;
setp.ge.s32 %p1, %r17, %r9;
mov.u32 %r19, 0;
@%p1 bra $L__BB0_3;

cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r14, %nctaid.x;
mul.lo.s32 %r3, %r1, %r14;

$L__BB0_2:
mul.wide.s32 %rd4, %r17, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.u32 %r15, [%rd5];
add.s32 %r19, %r15, %r19;
add.s32 %r17, %r17, %r3;
setp.lt.s32 %p2, %r17, %r9;
@%p2 bra $L__BB0_2;

$L__BB0_3:
cvta.to.global.u64 %rd6, %rd2;
atom.global.add.u32 %r16, [%rd6], %r19;
ret;

}

11 changes: 11 additions & 0 deletions 08/07_atomic/05/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
42 changes: 42 additions & 0 deletions 08/07_atomic/05/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

__global__ void parallel_filter(int *sum, int *res, int const *arr, int n) {
for (int i = blockDim.x * blockIdx.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
if (arr[i] >= 2) {
int loc = atomicAdd(&sum[0], 1);
res[loc] = arr[i];
}
}
}

int main() {
int n = 1<<24;
std::vector<int, CudaAllocator<int>> arr(n);
std::vector<int, CudaAllocator<int>> sum(1);
std::vector<int, CudaAllocator<int>> res(n);

for (int i = 0; i < n; i++) {
arr[i] = std::rand() % 4;
}

TICK(parallel_filter);
parallel_filter<<<n / 4096, 512>>>(sum.data(), res.data(), arr.data(), n);
checkCudaErrors(cudaDeviceSynchronize());
TOCK(parallel_filter);

for (int i = 0; i < sum[0]; i++) {
if (res[i] < 2) {
printf("Wrong At %d\n", i);
return -1;
}
}

printf("All Correct!\n");
return 0;
}
11 changes: 11 additions & 0 deletions 08/07_atomic/06/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
34 changes: 34 additions & 0 deletions 08/07_atomic/06/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

__global__ void parallel_sum(int *sum, int const *arr, int n) {
int local_sum = 0;
for (int i = blockDim.x * blockIdx.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
local_sum = std::max(local_sum, arr[i]);
}
atomicMax(&sum[0], local_sum);
}

int main() {
int n = 1<<24;
std::vector<int, CudaAllocator<int>> arr(n);
std::vector<int, CudaAllocator<int>> sum(1);

for (int i = 0; i < n; i++) {
arr[i] = std::rand() % 4;
}

TICK(parallel_sum);
parallel_sum<<<n / 4096, 512>>>(sum.data(), arr.data(), n);
checkCudaErrors(cudaDeviceSynchronize());
TOCK(parallel_sum);

printf("result: %d\n", sum[0]);

return 0;
}
11 changes: 11 additions & 0 deletions 08/07_atomic/07/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
43 changes: 43 additions & 0 deletions 08/07_atomic/07/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

__device__ __inline__ int my_atomic_add(int *dst, int src) {
int old = *dst, expect;
do {
expect = old;
old = atomicCAS(dst, expect, expect + src);
} while (expect != old);
return old;
}

__global__ void parallel_sum(int *sum, int const *arr, int n) {
int local_sum = 0;
for (int i = blockDim.x * blockIdx.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
local_sum += arr[i];
}
my_atomic_add(&sum[0], local_sum);
}

int main() {
int n = 65536;
std::vector<int, CudaAllocator<int>> arr(n);
std::vector<int, CudaAllocator<int>> sum(1);

for (int i = 0; i < n; i++) {
arr[i] = std::rand() % 4;
}

TICK(parallel_sum);
parallel_sum<<<n / 4096, 512>>>(sum.data(), arr.data(), n);
checkCudaErrors(cudaDeviceSynchronize());
TOCK(parallel_sum);

printf("result: %d\n", sum[0]);

return 0;
}
11 changes: 11 additions & 0 deletions 08/07_atomic/08/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
44 changes: 44 additions & 0 deletions 08/07_atomic/08/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <cstdio>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"

__device__ __inline__ int float_atomic_add(float *dst, float src) {
int old = __float_as_int(*dst), expect;
do {
expect = old;
old = atomicCAS((int *)dst, expect,
__float_as_int(__int_as_float(expect) + src));
} while (expect != old);
return old;
}

__global__ void parallel_sum(float *sum, float const *arr, int n) {
float local_sum = 0;
for (int i = blockDim.x * blockIdx.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
local_sum += arr[i];
}
float_atomic_add(&sum[0], local_sum);
}

int main() {
int n = 65536;
std::vector<float, CudaAllocator<float>> arr(n);
std::vector<float, CudaAllocator<float>> sum(1);

for (int i = 0; i < n; i++) {
arr[i] = std::rand() % 4;
}

TICK(parallel_sum);
parallel_sum<<<n / 4096, 512>>>(sum.data(), arr.data(), n);
checkCudaErrors(cudaDeviceSynchronize());
TOCK(parallel_sum);

printf("result: %f\n", sum[0]);

return 0;
}
Binary file modified 08/slides.pptx
Binary file not shown.

0 comments on commit 0832c0a

Please sign in to comment.