forked from JuliaGPU/CUDA.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: gpuarrays.jl
61 lines (40 loc) · 1.69 KB
/
gpuarrays.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# GPUArrays.jl interface
#
# Device functionality
#
## execution

# Singleton tag selecting the CUDA backend for GPUArrays' generic execution machinery.
struct CuArrayBackend <: AbstractGPUBackend end

# Kernel-side context passed as the first argument to GPUArrays kernels.
# Carries no state; it exists purely so the on-device methods below can
# dispatch to CUDA intrinsics.
struct CuKernelContext <: AbstractKernelContext end
# Suggest a launch configuration for running `f(ctx, args...)` over `elements`
# work items. The kernel is compiled (but not launched) so the occupancy API
# can be queried on the resulting function.
@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N};
                                            elements::Int, elements_per_thread::Int) where {F,N}
    kernel = @cuda launch=false f(CuKernelContext(), args...)

    # A grid-stride kernel (elements_per_thread > 1) keeps the grid small by itself,
    # so it can take the occupancy-suggested block size as-is. Otherwise, cap the
    # block size: launching many large blocks lowers performance, as observed with
    # broadcast.
    grid_stride = elements_per_thread > 1
    if grid_stride
        return launch_configuration(kernel.fun)
    end
    return launch_configuration(kernel.fun; max_threads=256)
end
# Host-side launcher for the GPUArrays interface: run `f` on the CUDA backend
# with the requested 1D thread/block configuration. The kernel context is
# prepended and `args` (a tuple) is splatted into the call; `name` is forwarded
# to `@cuda` and may be `nothing`.
@inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int,
                                    blocks::Int; name::Union{String,Nothing}) where {F,TT}
    return @cuda threads=threads blocks=blocks name=name f(CuKernelContext(), args...)
end
## on-device

# Indexing queries: GPUArrays' context-based accessors map one-to-one onto the
# CUDA intrinsics, reading only the x dimension (GPUArrays launches linearly).
GPUArrays.blockidx(::CuKernelContext) = blockIdx().x
GPUArrays.blockdim(::CuKernelContext) = blockDim().x
GPUArrays.threadidx(::CuKernelContext) = threadIdx().x
GPUArrays.griddim(::CuKernelContext) = gridDim().x
# memory

# Back GPUArrays' "local memory" with CUDA shared memory: reserve `prod(dims)`
# elements of `T` and wrap the pointer in a `dims`-shaped device array. The
# compile-time `id` is forwarded to `CUDA._shmem` (presumably keying the
# allocation so repeated requests alias — NOTE(review): confirm against
# `_shmem`'s definition).
@inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims},
                                       ::Val{id}) where {T, dims, id}
    len = prod(dims)
    raw = CUDA._shmem(Val(id), T, Val(len))
    shared_ptr = reinterpret(LLVMPtr{T, AS.Shared}, raw)
    return CuDeviceArray(dims, shared_ptr)
end
# synchronization

# GPUArrays' thread synchronization, implemented with CUDA's `sync_threads()`
# intrinsic (a block-level barrier).
@inline function GPUArrays.synchronize_threads(::CuKernelContext)
    return sync_threads()
end
#
# Host abstractions
#

# Associate `CuArray` (and all its subtypes) with the CUDA backend singleton,
# so generic GPUArrays functionality dispatches to the methods defined here.
function GPUArrays.backend(::Type{<:CuArray})
    return CuArrayBackend()
end