forked from JuliaGPU/GPUArrays.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvolution.jl
88 lines (80 loc) · 2.34 KB
/
convolution.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# function convolution_kernel(
# state,
# imgSrc::AbstractArray{T},
# kernelValues,
# kernel_width,
# imgConvolved,
# ::Val{BLOCK_SIZE},
# ::Val{LOCAL_WIDTH}
# ) where {T, BLOCK_SIZE, LOCAL_WIDTH}
# 1 = 1; 0 = 0
# w = kernel_width
# wBy2 = w >> 1 #w divided by 2
# #Goes up to 15x15 filters
# ptr = LocalMemory(state, T, LOCAL_WIDTH) # the local width needs to be static, so calculating it from the block size won't cut it
# P = CLArrays.LocalArray{T, 2}(ptr, (LOCAL_WIDTH, LOCAL_WIDTH))
#
# i = blockidx_x(state)
# j = blockidx_y(state) #Identification of work-item
# idX = threadidx_x(state)
# idY = threadidx_y(state)
#
# ii = i*BLOCK_SIZE + idX; # == get_global_id(0);
# jj = j*BLOCK_SIZE + idY; # == get_global_id(1);
# #Reads pixels
# P[idX, idY] = imgSrc[ii, jj]
# #Needs to read extra elements for the filter in the borders
# if (idX < w)
# P[idX + BLOCK_SIZE, idY] = imgSrc[ii + BLOCK_SIZE, jj]
# end
# if (idY < w)
# P[idX, idY + BLOCK_SIZE] = imgSrc[ii, jj + BLOCK_SIZE]
# end
# synchronize_threads(state)
# ##############
# convPix = zero(T);
# for ix = 0:(w - 1)
# for jy = 0:(w - 1)
# temp = P[ix, jy]
# convPix += temp * kernelValues[ix + w*jy]
# end
# end
# ##############
# synchronize_threads(state)
# imgConvolved[ii + wBy2, jj + wBy2] = P[idX + wBy2, idY + wBy2]
# return
# end
"""
    convolution_kernel(state, A, out, K, Asize, Ksize)

Kernel body executed once per work item. The work item's linear index is obtained from
`linear_index(state)` and converted to a subscript tuple with `gpu_ind2sub(Asize, ...)`.

For work items that pass the border guard, the products of the `Ksize[1]` x `Ksize[2]`
window of `A` anchored at that subscript with the entries of `K` are summed, and the sum
is stored in `out` at the work item's linear index. Work items that fail the guard return
without writing to `out`.

# Arguments
- `state`: opaque launch state, only forwarded to `linear_index`.
- `A`: input array; its element type `T` is the initial type of the accumulator.
- `out`: output array, written with a linear index.
- `K`: filter coefficients, read with a linear index.
- `Asize`, `Ksize`: sizes of `A` and `K`; only the first two entries are read.
"""
function convolution_kernel(state, A::AbstractArray{T}, out, K, Asize, Ksize) where T
    linear = linear_index(state)
    pos = gpu_ind2sub(Asize, linear)
    # Border guard: only positions strictly below `Asize - Ksize` in both of the first two
    # dimensions are processed.
    # NOTE(review): this looks more conservative than the window needs if `gpu_ind2sub`
    # yields 1-based subscripts - confirm before relaxing it.
    inside = pos[1] < Asize[1] - Ksize[1] && pos[2] < Asize[2] - Ksize[2]
    inside || return
    kw, kh = Ksize[1], Ksize[2]
    total = zero(T)
    for dx = 0:(kw - 1), dy = 0:(kh - 1)
        pixel = A[gpu_sub2ind(Asize, pos .+ (dx, dy))]
        # `K` is addressed linearly, with the first window offset varying fastest.
        total += pixel * K[dx + kw*dy + 1]
    end
    out[linear] = total
    return
end
"""
    convolution!(a, out, k)

Launch `convolution_kernel` through `gpu_call`, passing `a` as the second argument of
`gpu_call` and `(a, out, k, size(a), size(k))` as the kernel arguments, then call
`synchronize(out)` and return `out`.
"""
function convolution!(a, out, k)
    kernel_args = (a, out, k, size(a), size(k))
    gpu_call(convolution_kernel, a, kernel_args)
    synchronize(out)
    return out
end
"""
    FFTKernel{T}

Immutable container holding a `kernel` of type `T` together with two further values,
`irfftplan` and `rfftplan`.

NOTE(review): the field names suggest these are inverse/forward real-FFT plans, but
nothing in this file constructs an `FFTKernel` - confirm against the callers.
"""
struct FFTKernel{T}
    kernel::T
    # The two plan fields carry no type constraint, i.e. they are stored as `Any`.
    irfftplan::Any
    rfftplan::Any
end
"""
    fftkernel(A, kernel)

Return the result of `plan_rfft!(A)`.

NOTE(review): `kernel` is accepted but never used, and no `FFTKernel` is built here, so
this looks unfinished - confirm the intended behaviour. `plan_rfft!` is not defined in
this file; verify that it is actually in scope where this file is loaded.
"""
function fftkernel(A, kernel)
    plan = plan_rfft!(A)
    return plan
end
"""
    convolution_fft!(a, out, k)

Compute the FFT-based product of `a` with the conjugated spectrum of `k`, store the
inverse real transform in `out`, and return `out`.

The spectra `rfft(a)` and `rfft(k)` are multiplied element-wise, so they must be
broadcast-compatible.

NOTE(review): this presumably requires `k` to be already padded to the size of `a` -
confirm against the callers.

# Arguments
- `a`: real input array.
- `out`: destination array; overwritten with the result.
- `k`: filter array.
"""
function convolution_fft!(a, out, k)
    # The previous body referred to `A` and `krn`, which are not parameters of this
    # function, and dropped the transform result instead of writing it to `out`.
    spectrum = rfft(a) .* conj.(rfft(k))
    # `irfft` needs the original length of the first dimension to undo `rfft`.
    out .= irfft(spectrum, size(a, 1))
    return out
end