forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
float16.jl
174 lines (155 loc) · 5.74 KB
/
float16.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# This file is a part of Julia. License is MIT: http://julialang.org/license
function convert(::Type{Float32}, val::Float16)
ival::UInt32 = reinterpret(UInt16, val)
sign::UInt32 = (ival & 0x8000) >> 15
exp::UInt32 = (ival & 0x7c00) >> 10
sig::UInt32 = (ival & 0x3ff) >> 0
ret::UInt32
if exp == 0
if sig == 0
sign = sign << 31
ret = sign | exp | sig
else
n_bit = 1
bit = 0x0200
while (bit & sig) == 0
n_bit = n_bit + 1
bit = bit >> 1
end
sign = sign << 31
exp = (-14 - n_bit + 127) << 23
sig = ((sig & (~bit)) << n_bit) << (23 - 10)
ret = sign | exp | sig
end
elseif exp == 0x1f
if sig == 0 # Inf
if sign == 0
ret = 0x7f800000
else
ret = 0xff800000
end
else # NaN
ret = 0x7fc00000 | (sign<<31)
end
else
sign = sign << 31
exp = (exp - 15 + 127) << 23
sig = sig << (23 - 10)
ret = sign | exp | sig
end
return reinterpret(Float32, ret)
end
# Float32 -> Float16 algorithm from:
# "Fast Half Float Conversion" by Jeroen van der Zijp
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
const basetable = Array{UInt16}(512)
const shifttable = Array{UInt8}(512)
for i = 0:255
e = i - 127
if e < -24 # Very small numbers map to zero
basetable[i|0x000+1] = 0x0000
basetable[i|0x100+1] = 0x8000
shifttable[i|0x000+1] = 24
shifttable[i|0x100+1] = 24
elseif e < -14 # Small numbers map to denorms
basetable[i|0x000+1] = (0x0400>>(-e-14))
basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
shifttable[i|0x000+1] = -e-1
shifttable[i|0x100+1] = -e-1
elseif e <= 15 # Normal numbers just lose precision
basetable[i|0x000+1] = ((e+15)<<10)
basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
shifttable[i|0x000+1] = 13
shifttable[i|0x100+1] = 13
elseif e < 128 # Large numbers map to Infinity
basetable[i|0x000+1] = 0x7C00
basetable[i|0x100+1] = 0xFC00
shifttable[i|0x000+1] = 24
shifttable[i|0x100+1] = 24
else # Infinity and NaN's stay Infinity and NaN's
basetable[i|0x000+1] = 0x7C00
basetable[i|0x100+1] = 0xFC00
shifttable[i|0x000+1] = 13
shifttable[i|0x100+1] = 13
end
end
function convert(::Type{Float16}, val::Float32)
f = reinterpret(UInt32, val)
i = (f >> 23) & 0x1ff + 1
sh = shifttable[i]
f &= 0x007fffff
h::UInt16 = basetable[i] + (f >> sh)
# round
# NOTE: we maybe should ignore NaNs here, but the payload is
# getting truncated anyway so "rounding" it might not matter
nextbit = (f >> (sh-1)) & 1
if nextbit != 0
if h&1 == 1 || # round halfway to even
(f & ((1<<(sh-1))-1)) != 0 # check lower bits
h += 1
end
end
reinterpret(Float16, h)
end
convert(::Type{Bool}, x::Float16) = x==0 ? false : x==1 ? true : throw(InexactError())
convert(::Type{Int128}, x::Float16) = convert(Int128, Float32(x))
convert(::Type{UInt128}, x::Float16) = convert(UInt128, Float32(x))
convert{T<:Integer}(::Type{T}, x::Float16) = convert(T, Float32(x))
round{T<:Integer}(::Type{T}, x::Float16) = round(T, Float32(x))
trunc{T<:Integer}(::Type{T}, x::Float16) = trunc(T, Float32(x))
floor{T<:Integer}(::Type{T}, x::Float16) = floor(T, Float32(x))
ceil{ T<:Integer}(::Type{T}, x::Float16) = ceil(T, Float32(x))
round(x::Float16) = Float16(round(Float32(x)))
trunc(x::Float16) = Float16(trunc(Float32(x)))
floor(x::Float16) = Float16(floor(Float32(x)))
ceil(x::Float16) = Float16( ceil(Float32(x)))
isnan(x::Float16) = reinterpret(UInt16,x)&0x7fff > 0x7c00
isinf(x::Float16) = reinterpret(UInt16,x)&0x7fff == 0x7c00
isfinite(x::Float16) = reinterpret(UInt16,x)&0x7c00 != 0x7c00
function ==(x::Float16, y::Float16)
ix = reinterpret(UInt16,x)
iy = reinterpret(UInt16,y)
if (ix|iy)&0x7fff > 0x7c00 #isnan(x) || isnan(y)
return false
end
if (ix|iy)&0x7fff == 0x0000
return true
end
return ix == iy
end
-(x::Float16) = reinterpret(Float16, reinterpret(UInt16,x) $ 0x8000)
abs(x::Float16) = reinterpret(Float16, reinterpret(UInt16,x) & 0x7fff)
for op in (:+,:-,:*,:/,:\,:^)
@eval ($op)(a::Float16, b::Float16) = Float16(($op)(Float32(a), Float32(b)))
end
function fma(a::Float16, b::Float16, c::Float16)
Float16(fma(Float32(a), Float32(b), Float32(c)))
end
function muladd(a::Float16, b::Float16, c::Float16)
Float16(muladd(Float32(a), Float32(b), Float32(c)))
end
for op in (:<,:<=,:isless)
@eval ($op)(a::Float16, b::Float16) = ($op)(Float32(a), Float32(b))
end
for func in (:sin,:cos,:tan,:asin,:acos,:atan,:sinh,:cosh,:tanh,:asinh,:acosh,
:atanh,:exp,:log,:log2,:log10,:sqrt,:lgamma,:log1p,:erf,:erfc)
@eval begin
$func(a::Float16) = Float16($func(Float32(a)))
$func(a::Complex32) = Complex32($func(Complex64(a)))
end
end
for func in (:div,:fld,:cld,:rem,:mod,:atan2,:hypot)
@eval begin
$func(a::Float16,b::Float16) = Float16($func(Float32(a),Float32(b)))
end
end
ldexp(a::Float16, b::Integer) = Float16(ldexp(Float32(a), b))
^(x::Float16, y::Integer) = Float16(Float32(x)^y)
rationalize{T<:Integer}(::Type{T}, x::Float16; tol::Real=eps(x)) = rationalize(T, Float32(x); tol=tol)
reinterpret(::Type{Unsigned}, x::Float16) = reinterpret(UInt16,x)
reinterpret(::Type{Signed}, x::Float16) = reinterpret(Int16,x)
sign_mask(::Type{Float16}) = 0x8000
exponent_mask(::Type{Float16}) = 0x7c00
exponent_one(::Type{Float16}) = 0x3c00
exponent_half(::Type{Float16}) = 0x3800
significand_mask(::Type{Float16}) = 0x03ff