forked from JuliaData/CSV.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.jl
423 lines (364 loc) · 16.1 KB
/
utils.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
export PooledString
"""
    PooledString

A singleton type that can be used for signaling that a column of a csv file should be pooled,
with the output array type being a `PooledArray`.

The type has no fields; it is only used as a type marker.
"""
struct PooledString <: AbstractString end
# PointerString is an internal-only type for efficiently tracking string data + length
# all strings indexed from a column/row will always be a full String
# specifically, it allows avoiding materializing full Strings for pooled string columns while parsing
# and allows a fastpath for materializing a full String when no escaping is needed
#
# NOTE(review): the struct only stores a raw pointer and does not reference the buffer the
# pointer came from; presumably callers must keep that buffer alive for as long as the
# PointerString is used — confirm against the call sites.
struct PointerString <: AbstractString
    ptr::Ptr{UInt8}  # address of the first byte of the string data
    len::Int         # number of bytes (code units) available at `ptr`
end
# Hash the `s.len` bytes starting at `s.ptr`, mixing in the incoming hash state `h`.
# NOTE(review): this relies on the Base internals `memhash`/`memhash_seed` and looks like
# it is meant to produce the same hash as a `String` holding the same bytes (so both can
# be used as keys of the same Dict) — confirm against the targeted Julia versions.
function Base.hash(s::PointerString, h::UInt)
    seeded = h + Base.memhash_seed
    bytehash = ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32),
                     s.ptr, s.len, seeded % UInt32)
    return bytehash + seeded
end
import Base: ==
# Equality between a String and a PointerString (in either order), and between two
# PointerStrings, is defined on the raw bytes: the byte lengths must match and `memcmp`
# over that many bytes must report no difference.
function ==(x::String, y::PointerString)
    sizeof(x) == y.len || return false
    return 0 == ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), pointer(x), y.ptr, y.len)
end
function ==(x::PointerString, y::PointerString)
    x.len == y.len || return false
    return 0 == ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), x.ptr, y.ptr, y.len)
end
# the mirrored argument order simply forwards to the String-first method
function ==(y::PointerString, x::String)
    return x == y
end
# Pieces of the AbstractString interface implemented for PointerString.

# number of code units == number of bytes tracked by the pointer
function Base.ncodeunits(s::PointerString)
    return s.len
end

# load the `i`-th byte (1-based) directly from the pointed-to memory
@inline function Base.codeunit(s::PointerString, i::Integer)
    @boundscheck checkbounds(s, i)
    return GC.@preserve s unsafe_load(s.ptr, i)
end

# materialize a real String by copying `x.len` bytes starting at `x.ptr`
function Base.String(x::PointerString)
    return _unsafe_string(x.ptr, x.len)
end
# column bit flags; useful so we don't have to pass a bunch of arguments/state around manually
# each flag occupies one bit of a UInt8 and has a matching predicate that tests that bit

# bit 0: whether the user provided the type or not
const USER = 0x01
user(flag) = !iszero(flag & USER)

# bit 1: whether any missing values have been found in this column so far
const ANYMISSING = 0x01 << 1
anymissing(flag) = !iszero(flag & ANYMISSING)

# bit 2: whether a column type has been detected yet
const TYPEDETECTED = 0x01 << 2
typedetected(flag) = !iszero(flag & TYPEDETECTED)

# bit 3: whether a column will be "dropped" from the select/drop keyword arguments
const WILLDROP = 0x01 << 3
willdrop(flag) = !iszero(flag & WILLDROP)

# bit 4: whether strings should be lazy; results in LazyStringVectors
# this setting isn't per column, but we store it on the column bit flags anyway for convenience
const LAZYSTRINGS = 0x01 << 4
lazystrings(flag) = !iszero(flag & LAZYSTRINGS)

# bit 5: NOTE(review): presumably marks a column that may still end up pooled — the name is
# the only evidence here, confirm against the parsing code
const MAYBEPOOLED = 0x01 << 5
maybepooled(flag) = !iszero(flag & MAYBEPOOLED)

# ~95% z-score, 10% MoE
const POOLSAMPLESIZE = 100
# Build the initial bit flags for a column whose type `T` came from the user.
# `T === Union{}` means no type was given, so no type bits are set; otherwise the column is
# marked as user-typed and type-detected, plus ANYMISSING when `T` allows `missing`.
# The LAZYSTRINGS bit is added whenever `lazystrings` is true, independent of `T`.
function flag(T, lazystrings)
    typebits = 0x00
    if T !== Union{}
        typebits = USER | TYPEDETECTED
        if hasmissingtype(T)
            typebits |= ANYMISSING
        end
    end
    return lazystrings ? (typebits | LAZYSTRINGS) : typebits
end
# we define our own bit flag on a Parsers.ReturnCode to signal if a column needs to promote to string
# (bit 14 of the 16-bit code)
const PROMOTE_TO_STRING = Int16(0x4000)

# true when the promote-to-string bit is set on `code`
function promote_to_string(code)
    return (code & PROMOTE_TO_STRING) != 0
end

# true when `T` is `Missing` itself, or when subtracting `Missing` from `T` changes the type
function hasmissingtype(T)
    T === Missing && return true
    return ts(T, Missing) !== T
end
# Combine the column type seen so far (`T`) with a newly encountered type (`S`).
# - identical types, an unset (`Union{}`) or `Missing` side, or types that only differ by
#   `Missing`, are simply unioned
# - Int64 and Float64 (with or without `Missing`) combine to Float64, keeping `Missing`
# - every other combination falls back to String, keeping `Missing` when either side has it
@inline function promote_types(@nospecialize(T), @nospecialize(S))
    if T === Union{} || S === Union{} || T === Missing || S === Missing ||
       T === S || Base.nonmissingtype(T) === Base.nonmissingtype(S)
        return Union{T, S}
    end
    if T === Int64
        if S === Float64 || S === Union{Float64, Missing}
            return S
        end
        return hasmissingtype(S) ? Union{String, Missing} : String
    end
    if T === Union{Int64, Missing}
        if S === Float64 || S === Union{Float64, Missing}
            return Union{Float64, Missing}
        end
        return Union{String, Missing}
    end
    if T === Float64
        S === Int64 && return T
        S === Union{Int64, Missing} && return Union{Float64, Missing}
        return hasmissingtype(S) ? Union{String, Missing} : String
    end
    if T === Union{Float64, Missing}
        if S === Int64 || S === Union{Int64, Missing}
            return Union{Float64, Missing}
        end
        return Union{String, Missing}
    end
    return (hasmissingtype(T) || hasmissingtype(S)) ? Union{String, Missing} : String
end
## lazy strings
# bit patterns for missing value, int value, escaped string, position and len in lazy string parsing
# a PosLen is a 64-bit value read as:
#   bit 63      -> MISSING_BIT
#   bit 62      -> ESCAPE_BIT
#   bits 20-61  -> position (see `getpos`)
#   bits 0-19   -> length (see `getlen`)
primitive type PosLen 64 end

# bit-for-bit reinterpretation between PosLen and UInt64, also exposed through `convert`
PosLen(x::UInt64) = Core.bitcast(PosLen, x)
UInt64(x::PosLen) = Core.bitcast(UInt64, x)
Base.convert(::Type{PosLen}, x::UInt64) = PosLen(x)
Base.convert(::Type{UInt64}, x::PosLen) = UInt64(x)

const MISSING_BIT = 0x8000000000000000
const ESCAPE_BIT = 0x4000000000000000

# single-bit tests
missingvalue(x) = !iszero(UInt64(x) & MISSING_BIT)
escapedvalue(x) = !iszero(UInt64(x) & ESCAPE_BIT)

# field extraction: shift the position down first, then mask off the two flag bits
getpos(x) = (UInt64(x) >> 20) & 0x000003ffffffffff
getlen(x) = UInt64(x) & 0x00000000000fffff

# build a String from `len` bytes starting at pointer `p`
function _unsafe_string(p, len)
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
# Materialize the cell described by `poslen` from the byte buffer `buf`.
# Returns `missing` when the missing bit is set, otherwise the same String `strnomiss` would.
@inline function str(buf, e, poslen)
    missingvalue(poslen) && return missing
    return strnomiss(buf, e, poslen)
end

# Materialize the cell described by `poslen` from `buf` without looking at the missing bit.
# `e` is the escape byte: when the escape bit is set, the raw bytes go through `unescape`,
# otherwise they are copied straight into a new String.
@inline function strnomiss(buf, e, poslen)
    pos, len = getpos(poslen), getlen(poslen)
    # a raw pointer into `buf` is handed to `unescape`/`_unsafe_string`, so keep `buf`
    # rooted until the bytes have been copied out
    return GC.@preserve buf begin
        ptr = pointer(buf, pos)
        escapedvalue(poslen) ? unescape(PointerString(ptr, len), e) : _unsafe_string(ptr, len)
    end
end
# A vector of strings that are only materialized when indexed or iterated.
# `T` is the element type handed out (the indexing methods are defined for `String` and
# `Union{String, Missing}`), `A` is the concrete container of PosLen entries.
struct LazyStringVector{T, A <: AbstractVector{PosLen}} <: AbstractVector{T}
    buffer::Vector{UInt8}  # raw bytes the PosLen entries point into
    e::UInt8               # escape byte passed along when a cell needs unescaping
    poslens::A             # one PosLen (position/length/flag bits) per element
end

# convenience constructor that infers the PosLen container type
LazyStringVector{T}(buffer, e, poslens::A) where {T, A} = LazyStringVector{T, A}(buffer, e, poslens)

# the trait must be declared on `Type{<:LazyStringVector}`: `Type{LazyStringVector}` only
# matches the unparametrized UnionAll and is never hit for a concrete vector type
Base.IndexStyle(::Type{<:LazyStringVector}) = Base.IndexLinear()

Base.size(x::LazyStringVector) = (length(x.poslens),)
# Indexing materializes the requested cell on demand.
# Columns that allow missing values go through `str`, which checks the missing bit first.
Base.@propagate_inbounds function Base.getindex(x::LazyStringVector{Union{String, Missing}}, i::Int)
    @boundscheck checkbounds(x, i)
    return @inbounds str(x.buffer, x.e, x.poslens[i])
end

# Columns without missing values use `strnomiss` and so skip the missing-bit check.
Base.@propagate_inbounds function Base.getindex(x::LazyStringVector{String}, i::Int)
    @boundscheck checkbounds(x, i)
    return @inbounds strnomiss(x.buffer, x.e, x.poslens[i])
end
# optimize iterate for ChainedVector
# iteration is driven by iterating `x.poslens` directly; each PosLen produced is turned into
# a String (or `missing`) and the state of the underlying iterator is passed through as-is

# turn the result of iterating `x.poslens` into the result of iterating `x`
@inline function _lazystep(x::LazyStringVector{T}, next) where {T}
    next === nothing && return nothing
    poslen, state = next
    @inbounds value = T === String ? strnomiss(x.buffer, x.e, poslen) : str(x.buffer, x.e, poslen)
    return value, state
end

@inline Base.iterate(x::LazyStringVector{T}) where {T} = _lazystep(x, iterate(x.poslens))
@inline Base.iterate(x::LazyStringVector{T}, state) where {T} = _lazystep(x, iterate(x.poslens, state))
## column array allocating
# we don't want to use SentinelVector for small integer types due to the higher risk of
# sentinel value collision, so we just use Vector{Union{T, Missing}} and convert to Vector{T} if no missings were found
const SmallIntegers = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32}

# allocate columns for a full file
# one array of `rowsguess` elements is allocated per column via `allocate(T, len)`;
# string columns are stored as PosLen when the column's flags ask for lazy strings, and
# pooled string columns additionally get a fresh RefPool stored in `refs`
function allocate(rowsguess, ncols, types, flags, refs)
    columns = Vector{AbstractVector}(undef, ncols)
    for col in 1:ncols
        @inbounds begin
            T = types[col]
            uselazy = lazystrings(flags[col]) && (T === String || T === Union{String, Missing})
            columns[col] = allocate(uselazy ? PosLen : T, rowsguess)
        end
        pooltype = types[col]
        if pooltype === PooledString || pooltype === Union{PooledString, Missing}
            refs[col] = RefPool()
        end
    end
    return columns
end
# `allocate(T, len)` returns the array used to store `len` parsed values of a column whose
# type is `T`; the method chosen by dispatch on `T` decides the storage type.
# MissingVector is an efficient representation in SentinelArrays.jl package
allocate(::Type{Union{}}, len) = MissingVector(len)
allocate(::Type{Missing}, len) = MissingVector(len)
# lazy string columns store PosLen entries; every byte is set to 0xff, i.e. all bits of
# every entry start out set
function allocate(::Type{PosLen}, len)
    A = Vector{PosLen}(undef, len)
    memset!(pointer(A), typemax(UInt8), sizeof(A))
    return A
end
# strings use a SentinelVector whether or not the type includes Missing
allocate(::Type{String}, len) = SentinelVector{String}(undef, len)
allocate(::Type{Union{String, Missing}}, len) = SentinelVector{String}(undef, len)
# pooled strings are stored as UInt32 values
# NOTE(review): presumably these are the ref codes kept in the column's RefPool — confirm
allocate(::Type{PooledString}, len) = Vector{UInt32}(undef, len)
allocate(::Type{Union{PooledString, Missing}}, len) = Vector{UInt32}(undef, len)
# Bool and the small integer types use a plain Vector{Union{Missing, T}} rather than a SentinelVector
allocate(::Type{Bool}, len) = Vector{Union{Missing, Bool}}(undef, len)
allocate(::Type{Union{Missing, Bool}}, len) = Vector{Union{Missing, Bool}}(undef, len)
allocate(::Type{T}, len) where {T <: SmallIntegers} = Vector{Union{Missing, T}}(undef, len)
allocate(::Type{Union{Missing, T}}, len) where {T <: SmallIntegers} = Vector{Union{Missing, T}}(undef, len)
# fallback for every other type: a SentinelVector of the non-missing part of `T`
allocate(T, len) = SentinelVector{Base.nonmissingtype(T)}(undef, len)
# Resize the column array `A` to `len` elements.
reallocate!(A, len) = resize!(A, len)

# when reallocating, we just need to make sure the missing bit is set for lazy string PosLen
# Newly added slots are filled with 0xff bytes (all bits set). Nothing is filled when the
# vector stays the same size or shrinks: a negative byte count cannot be passed to `memset`.
# Returns `nothing`.
function reallocate!(A::Vector{PosLen}, len)
    oldlen = length(A)
    resize!(A, len)
    if len > oldlen
        # keep `A` rooted while its raw pointer is in use
        GC.@preserve A memset!(pointer(A, oldlen + 1), typemax(UInt8), (len - oldlen) * sizeof(PosLen))
    end
    return
end
# SentinelVector aliases used by `nonstandardtype` to name the storage type of a custom column
# SVec: second type parameter is `T` itself (used for isbits types)
# SVec2: second type parameter is `typeof(undef)` (used for all other types)
# NOTE(review): the meaning of the SentinelVector type parameters is defined by
# SentinelArrays.jl — confirm there before relying on the descriptions above.
const SVec{T} = SentinelVector{T, T, Missing, Vector{T}}
const SVec2{T} = SentinelVector{T, typeof(undef), Missing, Vector{T}}
# `ts(T, S)` wraps the compiler-internal `Core.Compiler.typesubtract`; `applicable` probes
# whether the two-argument form exists, otherwise the three-argument form is called with 16
# as its last argument
if applicable(Core.Compiler.typesubtract, Union{Int, Missing}, Missing)
    ts(T, S) = Core.Compiler.typesubtract(T, S)
else
    ts(T, S) = Core.Compiler.typesubtract(T, S, 16)
end
# when users pass non-standard types, we need to keep track of them in a Tuple{...} to generate efficient custom parsing kernel codes
# Every natively handled type is subtracted from `T`; `Union{}` is returned when `T` is
# unset or nothing is left over, otherwise a `Tuple{storage type, S}` for the remainder `S`.
function nonstandardtype(T)
    T === Union{} && return T
    S = T
    for standard in (Int64, Float64, String, PooledString, Bool, Date, DateTime, Time, Missing)
        S = ts(S, standard)
    end
    S === Union{} && return S
    # small integers use a plain Vector, isbits types SVec, anything else SVec2
    S <: SmallIntegers && return Tuple{Vector{Union{Missing, S}}, S}
    return isbitstype(S) ? Tuple{SVec{S}, S} : Tuple{SVec2{S}, S}
end
# Skip a UTF-8 byte order mark (0xEF 0xBB 0xBF) located at `pos` in `buf`.
# Returns the position right after the mark when one is present, otherwise `pos` unchanged.
# (original one-liner suggested from ScottPJones)
function consumeBOM(buf, pos)
    # all three candidate bytes must lie inside `buf`; checking only `length(buf) >= 3`
    # indexes past the end whenever `pos` is within two bytes of the end of the buffer
    (firstindex(buf) <= pos && pos + 2 <= lastindex(buf)) || return pos
    hasbom = buf[pos] == 0xef && buf[pos + 1] == 0xbb && buf[pos + 2] == 0xbf
    return hasbom ? pos + 3 : pos
end
# whatever input is given, turn it into an AbstractVector{UInt8} we can parse with
# Returns a 3-tuple `(bytes, startpos, len)`:
# - byte vectors are used as-is
# - a GenericIOBuffer exposes its `data`, `ptr` and `size` fields directly
# - a Cmd or other IO is read completely (deprecated path, emits a depwarn)
# - anything else is treated as a file name via `string(x)` and memory mapped, falling
#   back to `read(x)` when that fails
function getsource(x)
    x isa AbstractVector{UInt8} && return (x, 1, length(x))
    x isa Base.GenericIOBuffer && return (x.data, x.ptr, x.size)
    if x isa Cmd || x isa IO
        Base.depwarn("`CSV.File` or `CSV.Rows` with `$(typeof(x))` object is deprecated; pass a filename, `IOBuffer`, or byte buffer directly (via `read(x)`)", :getsource)
        bytes = Base.read(x)
        return (bytes, 1, length(bytes))
    end
    bytes = try
        Mmap.mmap(string(x))
    catch
        # if we can't mmap, try just `read`ing the whole thing into a byte vector
        read(x)
    end
    return (bytes, 1, length(bytes))
end
# Display name for the different kinds of input sources:
# a fixed label for raw byte vectors, the type name in angle brackets for IO objects,
# and `string(x)` for commands and everything else (e.g. file names)
getname(::Vector{UInt8}) = "<raw buffer>"
getname(io::IO) = string("<", typeof(io), ">")
getname(cmd::Cmd) = string(cmd)
getname(other) = string(other)
# normalizing column name utilities
# reserved words that get a leading underscore when used as a column name
const RESERVED = Set([
    "local", "global", "export", "let", "for", "struct", "while", "const", "continue",
    "import", "function", "if", "else", "try", "begin", "break", "catch", "return",
    "using", "baremodule", "macro", "finally", "module", "elseif", "end", "quote", "do",
])

# Symbols are taken as already normalized
normalizename(name::Symbol) = name

# Turn an arbitrary column name into a Symbol usable as an identifier:
# 1. Unicode-normalize and strip surrounding whitespace
# 2. replace every character that cannot appear in an identifier with '_'
# 3. prefix "_" when the result is empty, starts with a non-start character, or is reserved
# 4. collapse runs of underscores into a single one
function normalizename(name::String)::Symbol
    trimmed = strip(Unicode.normalize(name))
    if Base.isidentifier(trimmed)
        ident = trimmed
    else
        ident = map(ch -> Base.is_id_char(ch) ? ch : '_', trimmed)
    end
    needsprefix = isempty(ident) || !Base.is_id_start_char(ident[1]) || ident in RESERVED
    prefixed = string(needsprefix ? "_" : "", ident)
    return Symbol(replace(prefixed, r"(_)\1+"=>"_"))
end
# Return `names` as a `Vector{Symbol}` in which every entry is unique.
# The first occurrence of a name is kept; later duplicates get a `_k` suffix, using the
# smallest `k >= 1` that collides with neither an original name nor a name already emitted.
# Names are converted to Symbols before any comparison, so non-Symbol input (e.g. Strings)
# is deduplicated the same way as Symbol input.
function makeunique(names)
    syms = Symbol[Symbol(x) for x in names]
    original = Set(syms)
    length(original) == length(syms) && return syms
    # names emitted so far; a Set keeps each membership test O(1)
    seen = Set{Symbol}()
    sizehint!(seen, length(syms))
    for i in eachindex(syms)
        nm = syms[i]
        if nm in seen
            k = 1
            newnm = Symbol("$(nm)_$k")
            while newnm in original || newnm in seen
                k += 1
                newnm = Symbol("$(nm)_$k")
            end
            nm = newnm
            syms[i] = nm
        end
        push!(seen, nm)
    end
    return syms
end
# Build the initial per-column type vector from a user-provided Dict of types.
# The Dict may be keyed by column name (String or Symbol) or by column index (Int);
# columns without an entry get the default type `T`.
function initialtypes(T, x::AbstractDict{String}, names)
    return Type[get(x, string(nm), T) for nm in names]
end
function initialtypes(T, x::AbstractDict{Symbol}, names)
    return Type[get(x, nm, T) for nm in names]
end
function initialtypes(T, x::AbstractDict{Int}, names)
    return Type[get(x, i, T) for i in 1:length(names)]
end
# Build the initial per-column flag vector from a user-provided Dict of types.
# The Dict may be keyed by column name (String or Symbol) or by column index (Int);
# columns with an entry get `flag(type, lazystrings)`, all others the default flag value `T`.

# shared loop: `keyof(i, nm)` maps a column index/name to the key to look up in `x`
function _initialflags(keyof, T, x, names, lazystrings)
    flags = Vector{UInt8}(undef, length(names))
    for (i, nm) in enumerate(names)
        key = keyof(i, nm)
        flags[i] = haskey(x, key) ? flag(x[key], lazystrings) : T
    end
    return flags
end

initialflags(T, x::AbstractDict{String}, names, lazystrings) = _initialflags((i, nm) -> string(nm), T, x, names, lazystrings)
initialflags(T, x::AbstractDict{Symbol}, names, lazystrings) = _initialflags((i, nm) -> nm, T, x, names, lazystrings)
initialflags(T, x::AbstractDict{Int}, names, lazystrings) = _initialflags((i, nm) -> i, T, x, names, lazystrings)
# given a DateFormat, is it meant for parsing Date, DateTime, or Time?
# the format's tokens are checked for time parts (H, I, M, S, s) and date parts
# (y, Y, m, d, u, U): both kinds present -> DateTime, only time parts -> Time, otherwise Date
function timetype(df::Dates.DateFormat)
    timeparts = (Dates.DatePart{'H'}, Dates.DatePart{'I'}, Dates.DatePart{'M'},
                 Dates.DatePart{'S'}, Dates.DatePart{'s'})
    dateparts = (Dates.DatePart{'y'}, Dates.DatePart{'Y'}, Dates.DatePart{'m'},
                 Dates.DatePart{'d'}, Dates.DatePart{'u'}, Dates.DatePart{'U'})
    hastime = any(token -> typeof(token) in timeparts, df.tokens)
    hasdate = any(token -> typeof(token) in dateparts, df.tokens)
    if hastime
        return hasdate ? DateTime : Time
    end
    return Date
end
# if a cell value of a csv file has escape characters, we need to unescape it
# `s` is any string supporting `ncodeunits`/`codeunit`; `e` is the escape byte.
# Every escape byte is dropped and the byte following it is copied verbatim. An escape
# byte that is the very last byte of `s` has nothing to escape and is kept as-is, rather
# than reading one byte past the end of the string.
# Returns a new String.
function unescape(s, e)
    n = ncodeunits(s)
    buf = Base.StringVector(n)
    written = 0
    i = 1
    @inbounds while i <= n
        b = codeunit(s, i)
        # only skip ahead when there is a following byte to take
        if b == e && i < n
            i += 1
            b = codeunit(s, i)
        end
        written += 1
        buf[written] = b
        i += 1
    end
    resize!(buf, written)
    return String(buf)
end
"""
    CSV.detect(str::String; options=Parsers.OPTIONS)

Use the same logic used by `CSV.File` to detect column types, to parse a value from a plain string.
This can be useful in conjunction with the `CSV.Rows` type, which returns each cell of a file as a String.
The order of types attempted is: `Int64`, `Float64`, `Date`, `DateTime`, `Bool`, and if all fail, the input String is returned.
No errors are thrown.
For advanced usage, you can pass your own `Parsers.Options` object as the keyword argument `options=ops` for sentinel value detection.
"""
function detect end

# String entry point: run detection over the string's code units and hand back the
# original string when no type matched (`nothing` from the worker method)
function detect(str::String; options=Parsers.OPTIONS)
    parsed = detect(codeunits(str), 1, sizeof(str), options)
    return something(parsed, str)
end

# Worker: try to parse the bytes of `buf` from `pos` up to `len` as each candidate type.
# Returns `missing` when the Int64 attempt reports a sentinel match, the parsed value of the
# first type that succeeds and consumes the input up to `len`, or `nothing` when none does.
# Date-like parsing uses Date then DateTime when `options.dateformat` is `nothing`, otherwise
# only the type implied by the user-provided format; exceptions from those attempts are ignored.
function detect(buf, pos, len, options)
    # a parse only counts when it succeeded and its value ends exactly at `len`
    wholecell(code, vpos, vlen) = Parsers.ok(code) && vpos + vlen - 1 == len

    ival, icode, ipos, ilen, _ = Parsers.xparse(Int64, buf, pos, len, options)
    (Parsers.sentinel(icode) && icode > 0) && return missing
    wholecell(icode, ipos, ilen) && return ival

    fval, fcode, fpos, flen, _ = Parsers.xparse(Float64, buf, pos, len, options)
    wholecell(fcode, fpos, flen) && return fval

    if options.dateformat === nothing
        try
            dval, dcode, dpos, dlen, _ = Parsers.xparse(Date, buf, pos, len, options)
            wholecell(dcode, dpos, dlen) && return dval
        catch
        end
        try
            dtval, dtcode, dtpos, dtlen, _ = Parsers.xparse(DateTime, buf, pos, len, options)
            wholecell(dtcode, dtpos, dtlen) && return dtval
        catch
        end
    else
        try
            # use user-provided dateformat
            DT = timetype(options.dateformat)
            uval, ucode, upos, ulen, _ = Parsers.xparse(DT, buf, pos, len, options)
            wholecell(ucode, upos, ulen) && return uval
        catch
        end
    end

    bval, bcode, bpos, blen, _ = Parsers.xparse(Bool, buf, pos, len, options)
    wholecell(bcode, bpos, blen) && return bval
    return nothing
end
# a ReversedBuf takes a byte vector and indexes backwards;
# used for the footerskip keyword argument, which starts at the bottom of the file
# and skips lines backwards
# index 1 maps to the last byte of the wrapped vector, index `length` to its first byte
struct ReversedBuf <: AbstractVector{UInt8}
    buf::Vector{UInt8}
end

Base.size(a::ReversedBuf) = (length(a.buf),)
Base.IndexStyle(::Type{ReversedBuf}) = Base.IndexLinear()

function Base.getindex(a::ReversedBuf, i::Int)
    bytes = a.buf
    return bytes[lastindex(bytes) + 1 - i]
end
# Thin wrapper over C `memset`: fill `num` bytes starting at `ptr` with the byte `value`.
# Returns the pointer that `memset` returns.
function memset!(ptr, value, num)
    return ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), ptr, value, num)
end
# a RefPool holds our refs as a Dict, along with a lastref field which is incremented when a new ref is found while parsing pooled columns
mutable struct RefPool
    refs::Dict{Union{String, Missing}, UInt32}  # value (or missing) => ref code
    lastref::UInt32                             # starts at 0 for an empty pool
end

# an empty pool: no refs recorded yet
function RefPool()
    return RefPool(Dict{Union{String, Missing}, UInt32}(), zero(UInt32))
end