Skip to content

Commit 3b02991

Browse files
define ncodeunits(c::Char) as fast equivalent of ncodeunits(string(c))
There was a non-public `codelen(c::Char)` method which previously did this. This commit also replaces the internal uses of `codelen(c)` with `ncodeunits(c)`.
1 parent fc04d73 commit 3b02991

File tree

5 files changed

+37
-6
lines changed

5 files changed

+37
-6
lines changed

base/char.jl

+9-1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@ Char
5050
(::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
5151
(::Type{T})(x::T) where {T<:AbstractChar} = x
5252

53+
"""
54+
ncodeunits(c::Char) -> Int
55+
56+
Return the number of code units required to encode a character as UTF-8.
57+
This is the number of bytes which will be printed if the character is written
58+
to an output stream, or `ncodeunits(string(c))` but computed efficiently.
59+
"""
60+
ncodeunits(c::Char) = write(devnull, c) # this is surprisingly efficient
61+
5362
"""
5463
codepoint(c::AbstractChar) -> Integer
5564
@@ -197,7 +206,6 @@ hash(x::Char, h::UInt) =
197206
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
198207

199208
first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8
200-
codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3)
201209

202210
# fallbacks:
203211
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))

base/io.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -974,7 +974,7 @@ function skipchars(predicate, io::IO; linecomment=nothing)
974974
if c === linecomment
975975
readline(io)
976976
elseif !predicate(c)
977-
skip(io, -codelen(c))
977+
skip(io, -ncodeunits(c))
978978
break
979979
end
980980
end

base/strings/substring.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ function string(a::Union{Char, String, SubString{String}}...)
149149
n = 0
150150
for v in a
151151
if v isa Char
152-
n += codelen(v)
152+
n += ncodeunits(v)
153153
else
154154
n += sizeof(v)
155155
end
@@ -159,7 +159,7 @@ function string(a::Union{Char, String, SubString{String}}...)
159159
for v in a
160160
if v isa Char
161161
x = bswap(reinterpret(UInt32, v))
162-
for j in 1:codelen(v)
162+
for j in 1:ncodeunits(v)
163163
unsafe_store!(pointer(out, offs), x % UInt8)
164164
offs += 1
165165
x >>= 8

test/char.jl

+23
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,26 @@ Base.codepoint(c::ASCIIChar) = reinterpret(UInt8, c)
256256
@test_throws MethodError write(IOBuffer(), ASCIIChar('x'))
257257
@test_throws MethodError read(IOBuffer('x'), ASCIIChar)
258258
end
259+
260+
@testset "ncodeunits(::Char)" begin
261+
# valid encodings
262+
@test ncodeunits('\0') == 1
263+
@test ncodeunits('\x1') == 1
264+
@test ncodeunits('\x7f') == 1
265+
@test ncodeunits('\u80') == 2
266+
@test ncodeunits('\uff') == 2
267+
@test ncodeunits('\u7ff') == 2
268+
@test ncodeunits('\u800') == 3
269+
@test ncodeunits('\uffff') == 3
270+
@test ncodeunits('\U10000') == 4
271+
@test ncodeunits('\U10ffff') == 4
272+
# invalid encodings
273+
@test ncodeunits(reinterpret(Char, 0x80_00_00_00)) == 1
274+
@test ncodeunits(reinterpret(Char, 0x81_00_00_00)) == 1
275+
@test ncodeunits(reinterpret(Char, 0x80_80_00_00)) == 2
276+
@test ncodeunits(reinterpret(Char, 0x80_01_00_00)) == 2
277+
@test ncodeunits(reinterpret(Char, 0x80_00_80_00)) == 3
278+
@test ncodeunits(reinterpret(Char, 0x80_00_01_00)) == 3
279+
@test ncodeunits(reinterpret(Char, 0x80_00_00_80)) == 4
280+
@test ncodeunits(reinterpret(Char, 0x80_00_00_01)) == 4
281+
end

test/iostream.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
@test read(file, Char) == 'n'
3131

3232
# test it correctly handles unicode
33-
for (byte,char) in zip(1:4, ('@','߷','࿊','𐋺'))
33+
for (byte, char) in zip(1:4, ('@','߷','࿊','𐋺'))
3434
append_to_file("abcdef$char")
35-
@test Base.codelen(char) == byte
35+
@test ncodeunits(char) == byte
3636
@test !eof(skipchars(isletter, file))
3737
@test read(file, Char) == char
3838
end

0 commit comments

Comments
 (0)