Skip to content

Commit

Permalink
Switch hflights to nycflights.
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Aug 28, 2014
1 parent e42f2ac commit f456296
Show file tree
Hide file tree
Showing 29 changed files with 327 additions and 310 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ install:

before_script:
- psql -c 'create database lahman;' -U postgres
- psql -c 'create database hflights;' -U postgres
- psql -c 'create database nycflights13;' -U postgres
- psql -c 'create database test;' -U postgres
# Install dplyr package then cache sqlite and postgres version of Lahman
- R CMD INSTALL .
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Suggests:
ggplot2,
mgcv,
Lahman,
hflights
nycflights13
VignetteBuilder: knitr
LazyData: yes
LinkingTo: Rcpp (>= 0.11.1),
Expand All @@ -48,9 +48,9 @@ Collate:
'colwise.R'
'compute-collect.r'
'copy-to.r'
'data-hflights.r'
'data-lahman.r'
'data-nasa.r'
'data-nycflights13.r'
'data-temp.r'
'data.r'
'dbi-s3.r'
Expand Down
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -338,8 +338,6 @@ export(grouped_dt)
export(groups)
export(has_cluster)
export(has_lahman)
export(hflights_postgres)
export(hflights_sqlite)
export(id)
export(ident)
export(init_cluster)
Expand Down Expand Up @@ -374,6 +372,8 @@ export(n_distinct)
export(n_groups)
export(nth)
export(ntile)
export(nycflights13_postgres)
export(nycflights13_sqlite)
export(order_by)
export(partial_eval)
export(percent_rank)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# dplyr 0.2.0.99

* Now use `nycflights13` instead of `hflights` because the variables have
better names and there are a few interlinked tables (#562).

* `group_by()` will rename grouping variables (#410).

* When `mutate()` creates a new variable that uses a window function,
Expand Down
28 changes: 14 additions & 14 deletions R/chain.r
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,24 @@
#' @export
#' @examples
#' # If you're performing many operations you can either do step by step
#' if (require("hflights")) {
#' a1 <- group_by(hflights, Year, Month, DayofMonth)
#' a2 <- select(a1, Year:DayofMonth, ArrDelay, DepDelay)
#' if (require("nycflights13")) {
#' a1 <- group_by(flights, year, month, day)
#' a2 <- select(a1, arr_delay, dep_delay)
#' a3 <- summarise(a2,
#' arr = mean(ArrDelay, na.rm = TRUE),
#' dep = mean(DepDelay, na.rm = TRUE))
#' arr = mean(arr_delay, na.rm = TRUE),
#' dep = mean(dep_delay, na.rm = TRUE))
#' a4 <- filter(a3, arr > 30 | dep > 30)
#'
#' # If you don't want to save the intermediate results, you need to
#' # wrap the functions:
#' filter(
#' summarise(
#' select(
#' group_by(hflights, Year, Month, DayofMonth),
#' Year:DayofMonth, ArrDelay, DepDelay
#' group_by(flights, year, month, day),
#' arr_delay, dep_delay
#' ),
#' arr = mean(ArrDelay, na.rm = TRUE),
#' dep = mean(DepDelay, na.rm = TRUE)
#' arr = mean(arr_delay, na.rm = TRUE),
#' dep = mean(dep_delay, na.rm = TRUE)
#' ),
#' arr > 30 | dep > 30
#' )
Expand All @@ -53,12 +53,12 @@
#' # Alternatively you can use chain or %>% to sequence the operations
#' # linearly:
#'
#' hflights %>%
#' group_by(Year, Month, DayofMonth) %>%
#' select(Year:DayofMonth, ArrDelay, DepDelay) %>%
#' flights %>%
#' group_by(year, month, day) %>%
#' select(arr_delay, dep_delay) %>%
#' summarise(
#' arr = mean(ArrDelay, na.rm = TRUE),
#' dep = mean(DepDelay, na.rm = TRUE)
#' arr = mean(arr_delay, na.rm = TRUE),
#' dep = mean(dep_delay, na.rm = TRUE)
#' ) %>%
#' filter(arr > 30 | dep > 30)
#' }
Expand Down
50 changes: 0 additions & 50 deletions R/data-hflights.r

This file was deleted.

53 changes: 53 additions & 0 deletions R/data-nycflights13.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#' Database versions of the nycflights13 data
#'
#' These functions cache the data from the \code{nycflights13} database in
#' a local database, for use in examples and vignettes. Indexes are created
#' to make joining tables on natural keys efficient.
#'
#' @keywords internal
#' @name nycflights13
NULL

#' @export
#' @rdname nycflights13
#' @param path Location of the SQLite database file.
nycflights13_sqlite <- function(path = NULL) {
  # The whole body is handed to cache_computation() as a lazily evaluated
  # argument, so the database is only opened/populated when nothing is
  # cached under this key yet.
  # NOTE(review): the cache key is a constant and does not include `path`,
  # so a later call with a different `path` looks like it would return the
  # first cached source -- confirm this is intended.
  cache_computation("nycflights_sqlite", {
    db_file <- db_location(path, "nycflights13.sqlite")
    message("Caching nycflights db at ", db_file)
    cache_nycflights13(src_sqlite(db_file, create = TRUE))
  })
}

#' @export
#' @rdname nycflights13
#' @param dbname,... Arguments passed on to \code{\link{src_postgres}}
nycflights13_postgres <- function(dbname = "nycflights13", ...) {
  # Connect and populate only when nothing is cached under this key; the
  # braced expression is evaluated lazily by cache_computation().
  # NOTE(review): the cache key is a constant, so `dbname` and `...` are
  # not part of it -- confirm that reusing the first cached source for
  # differing arguments is intended.
  cache_computation("nycflights_postgres", {
    message("Caching nycflights db in postgresql db ", dbname)
    pg_src <- src_postgres(dbname, ...)
    cache_nycflights13(pg_src)
  })
}

# Copy every nycflights13 dataset that is not yet a table in `src` into
# `src` as a permanent (non-temporary) table, creating the indexes listed
# below for it, and return `src`.
#
# `src` is the destination source (as produced by src_sqlite() /
# src_postgres() in the callers). `...` is accepted but not used.
cache_nycflights13 <- function(src, ...) {
  # Columns to index for each table, keyed by table name. Tables without an
  # entry get `NULL` passed as their `indexes` argument.
  day_key <- c("year", "month", "day")
  indexes_by_table <- list(
    airlines = list("carrier"),
    airports = list("faa"),
    flights = list(day_key, "carrier", "tailnum", "origin", "dest"),
    planes = list("tailnum"),
    weather = list(day_key, "origin")
  )

  # Third column of the data() results matrix holds the dataset names
  # ("Item" per the utils::data documentation).
  available <- data(package = "nycflights13")$results[, 3]
  missing_tbls <- setdiff(available, src_tbls(src))

  # Create missing tables
  for (tbl_name in missing_tbls) {
    tbl_data <- getExportedValue("nycflights13", tbl_name)
    message("Creating table: ", tbl_name)

    copy_to(
      src, tbl_data, tbl_name,
      indexes = indexes_by_table[[tbl_name]],
      temporary = FALSE
    )
  }

  src
}
10 changes: 10 additions & 0 deletions R/data.r
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ get_cache <- function(name) {
get(name, envir = cache)
}

# Return the value cached under `name`, computing and storing it first if
# it is not cached yet.
#
# `computation` is only evaluated when there is no cached value, so callers
# can pass an expensive expression without paying for it on a cache hit.
cache_computation <- function(name, computation) {
  if (is_cached(name)) {
    return(get_cache(name))
  }

  # Cache miss: evaluate the lazily passed expression exactly once.
  value <- force(computation)
  set_cache(name, value)
  value
}

load_srcs <- function(f, src_names, quiet = NULL) {
if (is.null(quiet)) {
quiet <- !identical(Sys.getenv("NOT_CRAN"), "true")
Expand Down
10 changes: 5 additions & 5 deletions R/do.r
Original file line number Diff line number Diff line change
Expand Up @@ -58,21 +58,21 @@
#' compare <- models %>% do(aov = anova(.$mod_linear, .$mod_quad))
#' # compare %>% summarise(p.value = aov$`Pr(>F)`)
#'
#' if (require("hflights")) {
#' if (require("nycflights13")) {
#' # You can use it to do any arbitrary computation, like fitting a linear
#' # model. Let's explore how carrier departure delays vary over the time
#' carriers <- group_by(hflights, UniqueCarrier)
#' carriers <- group_by(flights, carrier)
#' group_size(carriers)
#'
#' mods <- do(carriers, mod = lm(ArrDelay ~ DepTime, data = .))
#' mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
#' mods %>% do(as.data.frame(coef(.$mod)))
#' mods %>% summarise(rsq = summary(mod)$r.squared)
#'
#' \dontrun{
#' # This longer example shows the progress bar in action
#' by_dest <- hflights %>% group_by(Dest) %>% filter(n() > 100)
#' by_dest <- flights %>% group_by(dest) %>% filter(n() > 100)
#' library(mgcv)
#' by_dest %>% do(smooth = gam(ArrDelay ~ s(DepTime) + Month, data = .))
#' by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month, data = .))
#' }
#' }
do <- function(.data, ...) {
  # S3 generic: dispatch to the `do` method for the class of `.data`.
  UseMethod("do")
}
Expand Down
6 changes: 3 additions & 3 deletions R/group-size.r
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
#' @param x a grouped tbl
#' @export
#' @examples
#' if (require("hflights")) {
#' if (require("nycflights13")) {
#'
#' by_day <- hflights %>% group_by(Year, Month, DayofMonth)
#' by_day <- flights %>% group_by(year, month, day)
#' n_groups(by_day)
#' group_size(by_day)
#'
#' by_dest <- hflights %>% group_by(Dest)
#' by_dest <- flights %>% group_by(dest)
#' n_groups(by_dest)
#' group_size(by_dest)
#' }
Expand Down
12 changes: 6 additions & 6 deletions R/grouped-dt.r
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
#' @param vars a list of quoted variables.
#' @export
#' @examples
#' if (require("data.table") && require("hflights")) {
#' hflights_dt <- tbl_dt(hflights)
#' group_size(group_by(hflights_dt, Year, Month, DayofMonth))
#' group_size(group_by(hflights_dt, Dest))
#' if (require("data.table") && require("nycflights13")) {
#' flights_dt <- tbl_dt(flights)
#' group_size(group_by(flights_dt, year, month, day))
#' group_size(group_by(flights_dt, dest))
#'
#' monthly <- group_by(hflights_dt, Month)
#' summarise(monthly, n = n(), delay = mean(ArrDelay))
#' monthly <- group_by(flights_dt, month)
#' summarise(monthly, n = n(), delay = mean(arr_delay))
#' }
grouped_dt <- function(data, vars) {
stopifnot(is.data.table(data))
Expand Down
26 changes: 13 additions & 13 deletions R/manip-dt.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,22 @@
#' @param .env The environment in which to evaluate arguments not included
#' in the data. The default should suffice for ordinary usage.
#' @examples
#' if (require("data.table") && require("hflights")) {
#' if (require("data.table") && require("nycflights13")) {
#' # If you start with a data table, you end up with a data table
#' hflights <- as.data.table(hflights)
#' filter(hflights, Month == 1, DayofMonth == 1, Dest == "DFW")
#' head(select(hflights, Year:DayOfWeek))
#' summarise(hflights, delay = mean(ArrDelay, na.rm = TRUE), n = length(ArrDelay))
#' head(mutate(hflights, gained = ArrDelay - DepDelay))
#' head(arrange(hflights, Dest, desc(ArrDelay)))
#' flights <- as.data.table(flights)
#' filter(flights, month == 1, day == 1, dest == "DFW")
#' head(select(flights, year:day))
#' summarise(flights, delay = mean(arr_delay, na.rm = TRUE), n = length(arr_delay))
#' head(mutate(flights, gained = arr_delay - dep_delay))
#' head(arrange(flights, dest, desc(arr_delay)))
#'
#' # If you start with a tbl, you end up with a tbl
#' hflights2 <- as.tbl(hflights)
#' filter(hflights2, Month == 1, DayofMonth == 1, Dest == "DFW")
#' head(select(hflights2, Year:DayOfWeek))
#' summarise(hflights2, delay = mean(ArrDelay, na.rm = TRUE), n = length(ArrDelay))
#' head(mutate(hflights2, gained = ArrDelay - DepDelay))
#' head(arrange(hflights2, Dest, desc(ArrDelay)))
#' flights2 <- as.tbl(flights)
#' filter(flights2, month == 1, day == 1, dest == "DFW")
#' head(select(flights2, year:day))
#' summarise(flights2, delay = mean(arr_delay, na.rm = TRUE), n = length(arr_delay))
#' head(mutate(flights2, gained = arr_delay - dep_delay))
#' head(arrange(flights2, dest, desc(arr_delay)))
#' }
#' @name manip_dt
NULL
Expand Down
22 changes: 11 additions & 11 deletions R/manip-grouped-dt.r
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,24 @@
#' @param inplace if \code{FALSE} (the default) the data frame will be copied
#' prior to modification to avoid changes propagating via reference.
#' @examples
#' if (require("data.table") && require("hflights")) {
#' hflights2 <- tbl_dt(hflights)
#' by_dest <- group_by(hflights2, Dest)
#' if (require("data.table") && require("nycflights13")) {
#' flights2 <- tbl_dt(flights)
#' by_dest <- group_by(flights2, dest)
#'
#' filter(by_dest, ArrDelay == max(ArrDelay, na.rm = TRUE))
#' summarise(by_dest, arr = mean(ArrDelay, na.rm = TRUE))
#' filter(by_dest, arr_delay == max(arr_delay, na.rm = TRUE))
#' summarise(by_dest, arr = mean(arr_delay, na.rm = TRUE))
#'
#' # Normalise arrival and departure delays by airport
#' scaled <- mutate(by_dest, arr_z = scale(ArrDelay), dep_z = scale(DepDelay))
#' select(scaled, Year:DayOfWeek, Dest, arr_z:dep_z)
#' scaled <- mutate(by_dest, arr_z = scale(arr_delay), dep_z = scale(dep_delay))
#' select(scaled, year:day, dest, arr_z:dep_z)
#'
#' arrange(by_dest, desc(ArrDelay))
#' select(by_dest, -(DayOfWeek:TailNum))
#' arrange(by_dest, desc(arr_delay))
#' select(by_dest, -(day:tailnum))
#'
#' # All manip functions preserve grouping structure, except for summarise
#' # which removes a grouping level
#' by_day <- group_by(hflights, Year, Month, DayofMonth)
#' by_month <- summarise(by_day, delayed = sum(ArrDelay > 0, na.rm = TRUE))
#' by_day <- group_by(flights, year, month, day)
#' by_month <- summarise(by_day, delayed = sum(arr_delay > 0, na.rm = TRUE))
#' by_month
#' summarise(by_month, delayed = sum(delayed))
#'
Expand Down
7 changes: 4 additions & 3 deletions R/manip.r
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,12 @@ rename <- function(.data, ...) UseMethod("rename")
#'
#' @export
#' @examples
#' data("hflights", package = "hflights")
#' carriers <- group_by(hflights, UniqueCarrier)
#' if (require("nycflights13")) {
#' carriers <- group_by(flights, carrier)
#' summarise(carriers, n())
#' mutate(carriers, n = n())
#' filter(carriers, n() == 79)
#' filter(carriers, n() < 100)
#' }
n <- function() {
  # Calling this function directly always raises the error below.
  msg <- "This function should not be called directly"
  stop(msg)
}
Loading

0 comments on commit f456296

Please sign in to comment.