forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchar.jl
316 lines (270 loc) · 11.3 KB
/
char.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# This file is a part of Julia. License is MIT: https://julialang.org/license
"""
The `AbstractChar` type is the supertype of all character implementations
in Julia. A character represents a Unicode code point, and can be converted
to an integer via the [`codepoint`](@ref) function in order to obtain the
numerical value of the code point, or constructed from the same integer.
These numerical values determine how characters are compared with `<` and `==`,
for example. New `T <: AbstractChar` types should define a `codepoint(::T)`
method and a `T(::UInt32)` constructor, at minimum.
A given `AbstractChar` subtype may be capable of representing only a subset
of Unicode, in which case conversion from an unsupported `UInt32` value
may throw an error. Conversely, the built-in [`Char`](@ref) type represents
a *superset* of Unicode (in order to losslessly encode invalid byte streams),
in which case conversion of a non-Unicode value *to* `UInt32` throws an error.
The [`isvalid`](@ref) function can be used to check which codepoints are
representable in a given `AbstractChar` type.
Internally, an `AbstractChar` type may use a variety of encodings. Conversion
via `codepoint(char)` will not reveal this encoding because it always returns the
Unicode value of the character. `print(io, c)` of any `c::AbstractChar`
produces an encoding determined by `io` (UTF-8 for all built-in `IO`
types), via conversion to `Char` if necessary.
`write(io, c)`, in contrast, may emit an encoding depending on
`typeof(c)`, and `read(io, typeof(c))` should read the same encoding as `write`.
New `AbstractChar` types must provide their own implementations of
`write` and `read`.
"""
AbstractChar
"""
Char(c::Union{Number,AbstractChar})
`Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation
of characters in Julia. `Char` is the type used for character literals like `'x'`
and it is also the element type of [`String`](@ref).
In order to losslessly represent arbitrary byte streams stored in a `String`,
a `Char` value may store information that cannot be converted to a Unicode
codepoint — converting such a `Char` to `UInt32` will throw an error.
The [`isvalid(c::Char)`](@ref) function can be used to query whether `c`
represents a valid Unicode character.
"""
Char
(::Type{T})(x::Number) where {T<:AbstractChar} = T(UInt32(x))
AbstractChar(x::Number) = Char(x)
(::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
(::Type{T})(x::T) where {T<:AbstractChar} = x
"""
ncodeunits(c::Char) -> Int
Return the number of code units required to encode a character as UTF-8.
This is the number of bytes which will be printed if the character is written
to an output stream, or `ncodeunits(string(c))` but computed efficiently.
!!! compat "Julia 1.1"
This method requires at least Julia 1.1. In Julia 1.0 consider
using `ncodeunits(string(c))`.
"""
ncodeunits(c::Char) = write(devnull, c) # this is surprisingly efficient
"""
codepoint(c::AbstractChar) -> Integer
Return the Unicode codepoint (an unsigned integer) corresponding
to the character `c` (or throw an exception if `c` does not represent
a valid character). For `Char`, this is a `UInt32` value, but
`AbstractChar` types that represent only a subset of Unicode may
return a different-sized integer (e.g. `UInt8`).
"""
function codepoint end
codepoint(c::Char) = UInt32(c)
struct InvalidCharError{T<:AbstractChar} <: Exception
char::T
end
struct CodePointError{T<:Integer} <: Exception
code::T
end
@noinline invalid_char(c::AbstractChar) = throw(InvalidCharError(c))
@noinline code_point_err(u::Integer) = throw(CodePointError(u))
function ismalformed(c::Char)
u = reinterpret(UInt32, c)
l1 = leading_ones(u) << 3
t0 = trailing_zeros(u) & 56
(l1 == 8) | (l1 + t0 > 32) |
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end
@inline is_overlong_enc(u::UInt32) = (u >> 24 == 0xc0) | (u >> 24 == 0xc1) | (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)
function isoverlong(c::Char)
u = reinterpret(UInt32, c)
is_overlong_enc(u)
end
# fallback: other AbstractChar types, by default, are assumed
# not to support malformed or overlong encodings.
"""
ismalformed(c::AbstractChar) -> Bool
Return `true` if `c` represents malformed (non-Unicode) data according to the
encoding used by `c`. Defaults to `false` for non-`Char` types. See also
[`show_invalid`](@ref).
"""
ismalformed(c::AbstractChar) = false
"""
isoverlong(c::AbstractChar) -> Bool
Return `true` if `c` represents an overlong UTF-8 sequence. Defaults
to `false` for non-`Char` types. See also [`decode_overlong`](@ref)
and [`show_invalid`](@ref).
"""
isoverlong(c::AbstractChar) = false
function UInt32(c::Char)
# TODO: use optimized inline LLVM
u = reinterpret(UInt32, c)
u < 0x80000000 && return u >> 24
l1 = leading_ones(u)
t0 = trailing_zeros(u) & 56
(l1 == 1) | (8l1 + t0 > 32) |
((((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) | is_overlong_enc(u)) &&
invalid_char(c)::Union{}
u &= 0xffffffff >> l1
u >>= t0
((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
end
"""
decode_overlong(c::AbstractChar) -> Integer
When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
the Unicode codepoint value of `c`. `AbstractChar` implementations
that support overlong encodings should implement `Base.decode_overlong`.
"""
function decode_overlong end
function decode_overlong(c::Char)
u = reinterpret(UInt32, c)
l1 = leading_ones(u)
t0 = trailing_zeros(u) & 56
u &= 0xffffffff >> l1
u >>= t0
((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
end
function Char(u::UInt32)
u < 0x80 && return reinterpret(Char, u << 24)
u < 0x00200000 || code_point_err(u)::Union{}
c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)
c = u < 0x00000800 ? (c << 16) | 0xc0800000 :
u < 0x00010000 ? (c << 08) | 0xe0808000 :
(c << 00) | 0xf0808080
reinterpret(Char, c)
end
function (T::Union{Type{Int8},Type{UInt8}})(c::Char)
i = reinterpret(Int32, c)
i ≥ 0 ? ((i >>> 24) % T) : T(UInt32(c))
end
function Char(b::Union{Int8,UInt8})
0 ≤ b ≤ 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
end
convert(::Type{AbstractChar}, x::Number) = Char(x) # default to Char
convert(::Type{T}, x::Number) where {T<:AbstractChar} = T(x)
convert(::Type{T}, x::AbstractChar) where {T<:Number} = T(x)
convert(::Type{T}, c::AbstractChar) where {T<:AbstractChar} = T(c)
convert(::Type{T}, c::T) where {T<:AbstractChar} = c
rem(x::AbstractChar, ::Type{T}) where {T<:Number} = rem(codepoint(x), T)
typemax(::Type{Char}) = reinterpret(Char, typemax(UInt32))
typemin(::Type{Char}) = reinterpret(Char, typemin(UInt32))
size(c::AbstractChar) = ()
size(c::AbstractChar, d::Integer) = d < 1 ? throw(BoundsError()) : 1
ndims(c::AbstractChar) = 0
ndims(::Type{<:AbstractChar}) = 0
length(c::AbstractChar) = 1
IteratorSize(::Type{Char}) = HasShape{0}()
firstindex(c::AbstractChar) = 1
lastindex(c::AbstractChar) = 1
getindex(c::AbstractChar) = c
getindex(c::AbstractChar, i::Integer) = i == 1 ? c : throw(BoundsError())
getindex(c::AbstractChar, I::Integer...) = all(x -> x == 1, I) ? c : throw(BoundsError())
first(c::AbstractChar) = c
last(c::AbstractChar) = c
eltype(::Type{T}) where {T<:AbstractChar} = T
iterate(c::AbstractChar, done=false) = done ? nothing : (c, true)
isempty(c::AbstractChar) = false
in(x::AbstractChar, y::AbstractChar) = x == y
==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8
# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
hash(x::AbstractChar, h::UInt) = hash(Char(x), h)
widen(::Type{T}) where {T<:AbstractChar} = T
@inline -(x::AbstractChar, y::AbstractChar) = Int(x) - Int(y)
@inline -(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) - Int32(y))
@inline +(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) + Int32(y))
@inline +(x::Integer, y::AbstractChar) = y + x
# `print` should output UTF-8 by default for all AbstractChar types.
# (Packages may implement other IO subtypes to specify different encodings.)
# In contrast, `write(io, c)` outputs a `c` in an encoding determined by typeof(c).
print(io::IO, c::Char) = (write(io, c); nothing)
print(io::IO, c::AbstractChar) = print(io, Char(c)) # fallback: convert to output UTF-8
const hex_chars = UInt8['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
function show_invalid(io::IO, c::Char)
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
a = hex_chars[((u >> 28) & 0xf) + 1]
b = hex_chars[((u >> 24) & 0xf) + 1]
write(io, 0x5c, UInt8('x'), a, b)
(u <<= 8) == 0 && break
end
write(io, 0x27)
end
"""
show_invalid(io::IO, c::AbstractChar)
Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
[`ismalformed(c)`](@ref) return `true`. Subclasses
of `AbstractChar` should define `Base.show_invalid` methods
if they support storing invalid character data.
"""
show_invalid
# show c to io, assuming UTF-8 encoded output
function show(io::IO, c::AbstractChar)
if c <= '\\'
b = c == '\0' ? 0x30 :
c == '\a' ? 0x61 :
c == '\b' ? 0x62 :
c == '\t' ? 0x74 :
c == '\n' ? 0x6e :
c == '\v' ? 0x76 :
c == '\f' ? 0x66 :
c == '\r' ? 0x72 :
c == '\e' ? 0x65 :
c == '\'' ? 0x27 :
c == '\\' ? 0x5c : 0xff
if b != 0xff
write(io, 0x27, 0x5c, b, 0x27)
return
end
end
if isoverlong(c) || ismalformed(c)
show_invalid(io, c)
elseif isprint(c)
write(io, 0x27)
print(io, c) # use print, not write, to use UTF-8 for any AbstractChar
write(io, 0x27)
else # unprintable, well-formed, non-overlong Unicode
u = codepoint(c)
write(io, 0x27, 0x5c, u <= 0x7f ? 0x78 : u <= 0xffff ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
end
return
end
function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar}
show(io, c)
get(io, :compact, false) && return
if !ismalformed(c)
print(io, ": ")
if isoverlong(c)
print(io, "[overlong] ")
u = decode_overlong(c)
c = T(u)
else
u = codepoint(c)
end
h = uppercase(string(u, base = 16, pad = 4))
print(io, (isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
else
print(io, ": Malformed UTF-8")
end
abr = Unicode.category_abbrev(c)
str = Unicode.category_string(c)
print(io, " (category ", abr, ": ", str, ")")
end