forked from covid19datahub/COVID19
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcovid19.R
365 lines (287 loc) · 10.8 KB
/
covid19.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#' COVID-19 Data Hub
#'
#' Unified datasets for a better understanding of COVID-19.
#'
#' @param country vector of country names or \href{https://github.com/covid19datahub/COVID19/blob/master/inst/extdata/db/ISO.csv}{ISO codes} (alpha-2, alpha-3 or numeric).
#' @param level integer. Granularity level. 1: country-level data. 2: state-level data. 3: city-level data.
#' @param start the start date of the period of interest.
#' @param end the end date of the period of interest.
#' @param vintage logical. Retrieve the snapshot of the dataset that was generated at the \code{end} date instead of using the latest version. Default \code{FALSE}.
#' @param raw logical. Skip data cleaning? Default \code{TRUE}. See details.
#' @param cache logical. Memory caching? Significantly improves performance on successive calls. Default \code{TRUE}.
#' @param verbose logical. Print data sources? Default \code{TRUE}.
#' @param debug logical. Warnings and stop on failure? Default \code{FALSE}.
#'
#' @details
#' If \code{raw=FALSE}, the raw data are cleaned by filling missing dates with \code{NA} values.
#' This ensures that all locations share the same grid of dates and no single day is skipped.
#' Then, \code{NA} values are replaced with the previous non-\code{NA} value or \code{0}.
#'
#' @return Grouped \code{tibble} (\code{data.frame})
#'
#' @examples
#' \dontrun{
#'
#' # Worldwide data by country
#' x <- covid19()
#'
#' # Worldwide data by state
#' x <- covid19(level = 2)
#'
#' # Specific country data by city
#' x <- covid19(c("Italy","US"), level = 3)
#'
#' # Data sources
#' s <- attr(x, "src")
#' }
#'
#' @source \url{https://covid19datahub.io}
#'
#' @references
#' Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open Source Software 5(51):2376, \doi{10.21105/joss.02376}.
#'
#' @note
#' We have invested a lot of time and effort in creating \href{https://covid19datahub.io}{COVID-19 Data Hub}, please:
#'
#' \itemize{
#' \item cite \href{https://doi.org/10.21105/joss.02376}{Guidotti and Ardia (2020)} when using \href{https://covid19datahub.io}{COVID-19 Data Hub}.
#' \item place the URL \url{https://covid19datahub.io} in a footnote to help others find \href{https://covid19datahub.io}{COVID-19 Data Hub}.
#' \item you assume full risk for the use of \href{https://covid19datahub.io}{COVID-19 Data Hub}.
#' We try our best to guarantee the data quality and consistency and the continuous updating of the Data Hub.
#' However, it is free software and comes with ABSOLUTELY NO WARRANTY.
#' Reliance on \href{https://covid19datahub.io}{COVID-19 Data Hub} for medical guidance or use of \href{https://covid19datahub.io}{COVID-19 Data Hub} in commerce is strictly prohibited.
#' }
#'
#' @export
#'
covid19 <- function(country = NULL,
                    level = 1,
                    start = "2019-01-01",
                    end = Sys.Date(),
                    raw = TRUE,
                    vintage = FALSE,
                    verbose = TRUE,
                    cache = TRUE,
                    debug = FALSE){
  # fallback: only administrative levels 1-3 are supported
  if(!(level %in% 1:3))
    stop("valid options for 'level' are:
1: admin area level 1
2: admin area level 2
3: admin area level 3")
  # cache: the key encodes every argument that changes the downloaded data.
  # 'start'/'end' only slice the cached result, so they are not part of the
  # key (except 'end' when a vintage snapshot is requested).
  cachekey <- make.names(sprintf("covid19_%s_%s_%s_%s",paste0(country, collapse = "."), level, ifelse(vintage, end, 0), raw))
  if(cache && exists(cachekey, envir = cachedata)){
    x <- get(cachekey, envir = cachedata)
    return(x[x$date >= start & x$date <= end,])
  }
  # data accumulator
  x <- data.frame()
  # ISO: map the user-supplied country identifiers (alpha-2, alpha-3,
  # numeric code, or full English name) to ISO alpha-3 codes
  iso <- extdata("db","ISO.csv")
  if(is.null(country)){
    ISO <- iso$iso_alpha_3
  }
  else {
    ISO <- sapply(toupper(country), function(i) iso$iso_alpha_3[which(iso$iso_alpha_2==i | iso$iso_alpha_3==i | iso$iso_numeric==i | toupper(iso$administrative_area_level_1)==i)])
    ISO <- as.character(unique(ISO))
  }
  if(length(ISO)==0)
    return(NULL)
  # vintage: fetch a frozen snapshot from the remote storage instead of
  # pulling from the live per-country data sources
  if(vintage){
    url <- "https://storage.covid19datahub.io"
    name <- sprintf("%sdata-%s", ifelse(raw, 'raw', ''), level)
    # download: today's snapshot lives in per-level zip files...
    if(end == Sys.Date()){
      zip <- sprintf("%s/%s.zip", url, name)
      file <- sprintf("%s.csv", name)
      x <- try(read.zip(zip, file, cache = cache)[[1]], silent = TRUE)
      src <- try(read.csv(sprintf("%s/src.csv", url), cache = cache), silent = TRUE)
      # FIX: the original sprintf had no %s placeholder, so 'end' was
      # silently ignored; the wrapper was dead weight
      if("try-error" %in% c(class(x),class(src)) || is.null(x) || is.null(src))
        stop("vintage data not available today")
    }
    # ...while historical snapshots are one dated zip containing all levels
    else {
      if(end < "2020-04-14")
        stop("vintage data not available before 2020-04-14")
      zip <- sprintf("%s/%s.zip", url, end)
      files <- c(paste0("data-",1:3,".csv"), paste0("rawdata-",1:3,".csv"), "src.csv")
      names(files) <- gsub("\\.csv$", "", files)
      x <- try(read.zip(zip, files, cache = cache), silent = TRUE)
      if("try-error" %in% class(x) || is.null(x))
        stop(sprintf("vintage data not available on %s", end))
      src <- x[["src"]]
      x <- x[[name]]
    }
    # filter to the requested countries
    if(length(ISO)>0)
      x <- dplyr::filter(x, iso_alpha_3 %in% ISO)
    # check
    if(nrow(x)==0)
      return(NULL)
  }
  # download: build the dataset from the live per-country sources
  else {
    # world: a worldwide aggregate used to backfill countries without a
    # dedicated source (and to backfill missing columns at level 1)
    w <- try(cachecall("world", level = level, cache = cache))
    if("try-error" %in% class(w)){
      if(debug) stop("WORLD: try-error")
      w <- NULL
    }
    if(!is.null(w)){
      if(!ds_check_format(w, level = level, ci = 0.85, verbose = FALSE)){
        if(debug) stop("WORLD: check failed")
        w <- NULL
      }
    }
    # ISO: each country with a dedicated source has a function named after
    # its alpha-3 code in the package namespace
    for(fun in ISO) if(exists(fun, envir = asNamespace("COVID19"), mode = "function", inherits = FALSE)) {
      # try the country-specific downloader
      y <- try(cachecall(fun, level = level, cache = cache))
      # skip on NULL
      if(is.null(y))
        next
      # check try-error; in debug mode failures are fatal instead of skipped
      if("try-error" %in% class(y)){
        if(debug) stop(sprintf("%s: try-error", fun))
        next
      }
      # clean: keep only identifier columns plus the fast-changing variables
      y <- y[,intersect(colnames(y), c('iso_alpha_3','id','date',vars('fast')))]
      # check format (85% confidence interval sanity check)
      if(!ds_check_format(y, level = level, ci = 0.85, verbose = FALSE)){
        if(debug) stop(sprintf("%s: check failed", fun))
        next
      }
      # top level: merge in worldwide columns the dedicated source lacks
      if(level==1){
        # merge fallback
        if(!is.null(w)) {
          idx <- which(w$iso_alpha_3==fun)
          if(length(idx)){
            jdx <- which(!(colnames(w) %in% setdiff(colnames(y), "date")))
            if(length(jdx))
              y <- merge(y, w[idx,jdx], by = 'date', all = TRUE)
          }
        }
        # iso as id: at country level the alpha-3 code is the row identifier
        y$id <- fun
      }
      # add iso and bind data
      x <- y %>%
        dplyr::mutate(iso_alpha_3 = fun) %>%
        dplyr::bind_rows(x)
    }
    # fallback: countries without a dedicated source come from the worldwide
    # aggregate only
    if(!is.null(w))
      x <- w %>%
        dplyr::filter(!(iso_alpha_3 %in% x$iso_alpha_3) & iso_alpha_3 %in% ISO) %>%
        dplyr::bind_rows(x)
    # filter rows with no identifier
    x <- x[!is.na(x$id),]
    # check
    if(nrow(x)==0){
      warning("
Sorry, the data are not available.
Help us extending the number of supporting data sources as a joint effort against COVID-19.
Join the mission: https://covid19datahub.io")
      return(NULL)
    }
    # stringency measures: merge the Oxford COVID-19 Government Response
    # Tracker (OxCGRT) policy indicators, matched by date and OxCGRT id
    o <- try(cachecall('oxcgrt_git', level = level, cache = cache))
    if(!("try-error" %in% class(o))){
      # add oxcgrt_id: at country level it is simply the alpha-3 code;
      # below country level it must be resolved per administrative area
      if(level==1)
        x$oxcgrt_id <- x$iso_alpha_3
      else
        x <- x %>%
          dplyr::group_by(iso_alpha_3) %>%
          dplyr::group_map(.keep = TRUE, function(x, ...){
            x$oxcgrt_id <- get_oxcgrt_id(x$id, iso = unique(x$iso_alpha_3))
            return(x)
          }) %>%
          dplyr::bind_rows()
      # merge; on failure keep x without the stringency columns
      xo <- try(merge(x, o, by = c('date','oxcgrt_id'), all.x = TRUE), silent = !debug)
      if(!("try-error" %in% class(xo)))
        x <- xo
    }
    else{
      if(debug) stop("OxCGRT: try-error")
    }
    # subset: force the canonical column set, creating missing columns as NA
    key <- c('iso_alpha_3','id','date',vars('fast'))
    x[,key[!(key %in% colnames(x))]] <- NA
    x <- x[,key]
    # 0 to NA: a zero hospital/ventilator/ICU count is indistinguishable
    # from "not reported", so treat it as missing
    for(i in c('hosp','vent','icu'))
      x[[i]] <- dplyr::na_if(x[[i]], 0)
    # check: dates must never be missing
    if(length(which(idx <- is.na(x$date))))
      stop(sprintf("column 'date' contains NA values: %s", paste0(unique(x$iso_alpha_3[idx]), collapse = ", ")))
    # clean: when raw=FALSE, put every id on the same daily date grid, then
    # carry the last observation forward and zero-fill leading gaps
    dates <- seq(min(x$date), max(x$date), by = 1)
    if(!raw)
      x <- x %>%
        dplyr::group_by(id) %>%
        dplyr::group_map(.keep = TRUE, function(x, ...){
          miss <- dates[!(dates %in% x$date)]
          if(length(miss)>0)
            x <- x %>%
              dplyr::bind_rows(data.frame(date = miss)) %>%
              tidyr::fill(id, iso_alpha_3, .direction = "downup")
          return(x)
        }) %>%
        dplyr::bind_rows() %>%
        dplyr::group_by(id) %>%
        dplyr::arrange(date) %>%
        tidyr::fill(vars("fast"), .direction = "down") %>%
        tidyr::replace_na(as.list(sapply(vars("fast"), function(x) 0)))
    # check: at most one row per (id, date)
    if(length(idx <- which(duplicated(x[,c("id", "date")]))))
      stop(sprintf("multiple dates per id: %s", paste0(unique(x$id[idx]), collapse = ", ")))
    # merge top level data: attach country metadata from the ISO table
    y <- extdata("db","ISO.csv")
    if(level>1)
      y <- y[,c("iso_alpha_3","iso_alpha_2","iso_numeric","currency","administrative_area_level_1")]
    x <- merge(x, y, by = "iso_alpha_3", all.x = TRUE)
    # merge lower level data: per-country metadata tables keyed by id
    if(level>1)
      x <- x %>%
        dplyr::group_by(iso_alpha_3) %>%
        dplyr::group_map(.keep = TRUE, function(x, iso){
          y <- extdata("db", sprintf("%s.csv",iso[[1]]))
          if(!is.null(y))
            x <- merge(x, y[,!grepl("^id\\_", colnames(y))], by = "id", all.x = TRUE)
          return(x)
        }) %>%
        dplyr::bind_rows()
    # data source citations
    src <- extdata("src.csv")
  }
  # subset: canonical variables plus any key_* passthrough columns
  cn <- colnames(x)
  cn <- unique(c(vars(), "key", cn[grepl("^key\\_", cn)]))
  x[,cn[!(cn %in% colnames(x))]] <- NA
  x <- x[,cn]
  # type conversion (note: vars() here is the package-internal column
  # registry, not dplyr::vars)
  x <- x %>%
    dplyr::mutate_at('date', as.Date) %>%
    dplyr::mutate_at(vars('integer'), as.integer) %>%
    dplyr::mutate_at(vars('numeric'), as.numeric) %>%
    dplyr::mutate_at(vars('character'), as.character)
  # group by id and order chronologically
  x <- x %>%
    dplyr::group_by(id) %>%
    dplyr::arrange(id, date)
  # check: warn (don't stop) on duplicated location/date tuples
  if(any(duplicated(x[,c('date','administrative_area_level_1','administrative_area_level_2','administrative_area_level_3')])))
    warning("the tuple ('date','administrative_area_level_1','administrative_area_level_2','administrative_area_level_3') is not unique")
  # src: attach the data-source citations as an attribute
  attr(x, "src") <- try(cite(x, src, verbose = verbose))
  # cache the full (unsliced) dataset
  if(cache)
    assign(cachekey, x, envir = cachedata)
  # return the requested date window
  return(x[x$date >= start & x$date <= end,])
}