# linalg.jl, forked from JuliaGPU/CUDA.jl
# integration with LinearAlgebra.jl

CuMatOrAdj{T} = Union{CuMatrix,
                      LinearAlgebra.Adjoint{T, <:CuMatrix{T}},
                      LinearAlgebra.Transpose{T, <:CuMatrix{T}}}
CuOrAdj{T} = Union{CuVecOrMat,
                   LinearAlgebra.Adjoint{T, <:CuVecOrMat{T}},
                   LinearAlgebra.Transpose{T, <:CuVecOrMat{T}}}
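
# Illustrative sketch, not part of the original file: these aliases let the
# solver method below accept plain, adjoint, and transposed device arrays alike.
# Assuming a functional CUDA device, all of the following hold:
#
#   A = CUDA.rand(Float32, 4, 4)
#   A            isa CuMatOrAdj   # true
#   A'           isa CuMatOrAdj   # true
#   transpose(A) isa CuMatOrAdj   # true
#   CUDA.rand(Float32, 4) isa CuOrAdj   # true (vectors are covered via CuVecOrMat)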

# matrix division

function Base.:\(_A::CuMatOrAdj, _B::CuOrAdj)
    A, B = copy(_A), copy(_B)
    A, ipiv = CUSOLVER.getrf!(A)
    return CUSOLVER.getrs!('N', A, ipiv, B)
end
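
# Hypothetical usage sketch, not part of the original file; it assumes a
# functional CUDA device. The method above solves the system entirely on the
# GPU: an LU factorization via CUSOLVER.getrf! followed by a triangular solve
# via CUSOLVER.getrs!, operating on copies so the inputs stay untouched.
#
#   A = CUDA.rand(Float32, 4, 4)
#   b = CUDA.rand(Float32, 4)
#   x = A \ b        # CuVector{Float32} solving A*x == b
#   y = A' \ b       # the adjoint is materialized by copy() before factorizing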

# patch JuliaLang/julia#40899 to create a CuArray
# (see https://github.com/JuliaLang/julia/pull/41331#issuecomment-868374522)
if VERSION >= v"1.7-"

_zeros(::Type{T}, b::AbstractVector, n::Integer) where {T} = CUDA.zeros(T, max(length(b), n))
_zeros(::Type{T}, B::AbstractMatrix, n::Integer) where {T} = CUDA.zeros(T, max(size(B, 1), n), size(B, 2))

function Base.:\(F::Union{LinearAlgebra.LAPACKFactorizations{<:Any,<:CuArray},
                          Adjoint{<:Any,<:LinearAlgebra.LAPACKFactorizations{<:Any,<:CuArray}}},
                 B::AbstractVecOrMat)
    m, n = size(F)
    if m != size(B, 1)
        throw(DimensionMismatch("arguments must have the same number of rows"))
    end

    TFB = typeof(oneunit(eltype(B)) / oneunit(eltype(F)))
    FF = Factorization{TFB}(F)

    # For wide problems we (often) compute a minimum-norm solution. The solution
    # is larger than the right-hand side, so we use size(F, 2).
    BB = _zeros(TFB, B, n)

    if n > size(B, 1)
        # underdetermined
        copyto!(view(BB, 1:m, :), B)
    else
        copyto!(BB, B)
    end

    ldiv!(FF, BB)

    # For tall problems we compute a least-squares solution, so only part of the
    # rhs should be returned from \, while ldiv! uses (and returns) the complete rhs.
    return LinearAlgebra._cut_B(BB, 1:n)
end

end
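
# Hypothetical sketch, not part of the original file: the override above mirrors
# Base's factorization \ but allocates the (padded) right-hand side with
# CUDA.zeros, so the result stays on the device. Assuming Julia >= 1.7 and that
# the CUSOLVER wrappers provide lu for a CuMatrix:
#
#   A = CUDA.rand(Float32, 6, 6)
#   b = CUDA.rand(Float32, 6)
#   F = lu(A)        # LinearAlgebra.LU backed by CuArrays
#   x = F \ b        # dispatches to the method above; x is a CuVector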

# qr

using LinearAlgebra: AbstractQ

# AbstractQ's `size` is the size of the full matrix,
# while `Matrix(Q)` only gives the compact Q.
# See JuliaLang/julia#26591 and JuliaGPU/CUDA.jl#969.
CuMatrix{T}(Q::AbstractQ{S}) where {T,S} = convert(CuArray, Matrix{T}(Q))
CuMatrix(Q::AbstractQ{T}) where {T} = CuMatrix{T}(Q)
CuArray{T}(Q::AbstractQ) where {T} = CuMatrix{T}(Q)
CuArray(Q::AbstractQ) = CuMatrix(Q)
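
# Hypothetical sketch, not part of the original file: these constructors
# materialize an AbstractQ as a dense CuMatrix. Assuming a functional device and
# that qr of a CuMatrix yields an AbstractQ-backed Q factor:
#
#   A = CUDA.rand(Float32, 10, 4)
#   F = qr(A)
#   size(F.Q)              # (10, 10): the full orthogonal operator
#   Q = CuMatrix(F.Q)      # the compact Q factor as a dense CuMatrix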

# dot

function LinearAlgebra.dot(x::AnyCuArray{T1}, y::AnyCuArray{T2}) where {T1,T2}
    n = length(x)
    n == length(y) || throw(DimensionMismatch("dot product arguments have lengths $(length(x)) and $(length(y))"))

    # custom kernel using simple linear indexing and atomic additions,
    # resulting in about 10% speed-up compared to a simple mapreduce.
    function kernel(x, y, res::AbstractArray{T}, shuffle) where {T}
        local_val = zero(T)

        # grid-stride loop
        i = threadIdx().x + (blockIdx().x - 1i32)*blockDim().x
        while i <= length(x)
            @inbounds local_val += dot(x[i], y[i])
            i += blockDim().x * gridDim().x
        end

        val = reduce_block(+, local_val, zero(T), shuffle)
        if threadIdx().x == 1i32
            # NOTE: introduces nondeterminism
            @inbounds @atomic res[] += val
        end

        return
    end

    dev = device()
    let T = promote_type(T1, T2)
        # only use the above kernel if we don't care about determinism
        # and if atomic operations are supported on these inputs
        atomic = T <: Union{Int16, Int32, Int64, Float16, Float32, Float64}
        if math_mode() == PEDANTIC_MATH || !atomic
            return mapreduce((x,y)->dot(x, y), +, x, y)
        end

        res = zeros(T, 1)

        # be conservative about using shuffle instructions
        shuffle = T <: Union{Bool,
                             UInt8, UInt16, UInt32, UInt64, UInt128,
                             Int8, Int16, Int32, Int64, Int128,
                             Float16, Float32, Float64,
                             ComplexF16, ComplexF32, ComplexF64}

        # how many threads do we want?
        # reduce_block(shuffle=true) requires the block to consist of full warps.
        wanted_threads = shuffle ? nextwarp(dev, n) : n
        function compute_threads(max_threads)
            if wanted_threads > max_threads
                shuffle ? prevwarp(dev, max_threads) : max_threads
            else
                wanted_threads
            end
        end

        # how many threads can we launch?
        kernel = @cuda launch=false kernel(x, y, res, Val(shuffle))
        compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
        config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
        threads = compute_threads(config.threads)
        # enough blocks to cover n elements at `threads` per block, capped by the launch limit
        blocks = min(config.blocks, cld(n, threads))
        shmem = compute_shmem(threads)
        kernel(x, y, res, Val(shuffle); threads, blocks, shmem)

        @allowscalar res[]
    end
end
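
# Hypothetical usage sketch, not part of the original file; it assumes a
# functional device. For supported element types the custom atomic kernel above
# is used; under PEDANTIC_MATH (or for unsupported types) the deterministic
# mapreduce fallback is taken instead.
#
#   x = CUDA.rand(Float32, 1_000_000)
#   y = CUDA.rand(Float32, 1_000_000)
#   dot(x, y)                            # atomic kernel; summation order is nondeterministic
#
#   CUDA.math_mode!(CUDA.PEDANTIC_MATH)
#   dot(x, y)                            # deterministic mapreduce fallback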