Skip to content

Commit

Permalink
Manip for data frames
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed May 14, 2013
1 parent 686e530 commit 174e725
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 12 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ Collate:
'manip-source.r'
'manip.r'
'manip-sqlite.r'
'desc.r'
'manip-data-frame.r'
11 changes: 11 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,21 +1,32 @@
S3method(arrange,data.frame)
S3method(arrange,source)
S3method(arrange,source_sqlite)
S3method(as.data.frame,source_sqlite)
S3method(as.data.frame,standard_ops)
S3method(dim,source_sqlite)
S3method(dimnames,source_sqlite)
S3method(filter,data.frame)
S3method(filter,source)
S3method(filter,source_sqlite)
S3method(group,"function")
S3method(group,default)
S3method(group,source)
S3method(head,source_sqlite)
S3method(mutate,data.frame)
S3method(mutate,source)
S3method(mutate,source_sqlite)
S3method(names,source_sqlite)
S3method(select,data.frame)
S3method(select,source)
S3method(select,source_sqlite)
S3method(source_name,source_sqlite)
S3method(source_vars,source_sqlite)
S3method(summarise,data.frame)
S3method(summarise,source)
S3method(summarise,source_sqlite)
S3method(tail,source_sqlite)
export(arrange)
export(desc)
export(filter)
export(group)
export(mutate)
Expand Down
12 changes: 12 additions & 0 deletions R/desc.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#' Descending order.
#'
#' Transform a vector into a format that will be sorted in descending order.
#'
#' @param x vector to transform
#' @export
#' @examples
#' desc(1:10)
#' desc(factor(letters))
#' first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years")
#' desc(first_day)
desc <- function(x) -xtfrm(x)
80 changes: 80 additions & 0 deletions R/manip-data-frame.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#' Data manipulation for data frames.
#'
#' @examples
#' data("baseball", package = "plyr")
#
#' filter(baseball, year > 2005, g > 130)
#' head(select(baseball, id:team))
#' summarise(baseball, g = mean(g), n = count())
#' head(mutate(baseball, rbi = 1.0 * r / ab))
#' head(arrange(baseball, id, desc(year)))
#'
#' @name manip_data_frame
NULL

#' @rdname manip_data_frame
#' @export
#' @method filter data.frame
filter.data.frame <- function(.data, ...) {
conds <- dots(...)

r <- vapply(conds, eval, env = .data, enclos = parent.frame(),
FUN.VALUE = logical(nrow(.data)))

all <- rowSums(r, na.rm = TRUE) == ncol(r)
.data[all, , drop = FALSE]
}

#' @rdname manip_data_frame
#' @export
#' @method summarise data.frame
summarise.data.frame <- function(.data, ...) {
cols <- named_dots(...)
data_env <- list2env(.data, parent = parent.frame())
data_env$count <- function() nrow(.data)

for (col in names(cols)) {
data_env[[col]] <- eval(cols[[col]], data_env)
}

as_df(mget(names(cols), data_env))
}

#' @rdname manip_data_frame
#' @export
#' @method mutate data.frame
mutate.data.frame <- function(.data, ...) {
cols <- named_dots(...)
data_env <- list2env(.data, parent = parent.frame())

for(col in names(cols)) {
data_env[[col]] <- eval(cols[[col]], data_env)
}

out_cols <- union(names(.data), names(cols))
as_df(mget(out_cols, data_env))
}

#' @rdname manip_data_frame
#' @export
#' @method arrange data.frame
arrange.data.frame <- function(.data, ...) {
r <- eval(substitute(order(...)), .data, parent.frame())
if(length(r) != nrow(.data)) {
stop("Ordering vectors not the same length as data", call. = FALSE)
}
.data[r, , drop = FALSE]
}

#' @rdname manip_data_frame
#' @export
#' @method select data.frame
select.data.frame <- function(.data, ...) {
nm <- names(.data)
nm_env <- as.list(setNames(seq_along(nm), nm))

idx <- unlist(lapply(dots(...), eval, nm_env, parent.frame()))
select <- nm[idx]

.data[, nm[idx], drop = FALSE]
}
11 changes: 5 additions & 6 deletions R/manip-sqlite.r
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#'
#' @examples
#' data("baseball", package = "plyr")
#' bdf <- data_frame_source(baseball)
#' baseball_s <- sqlite_source("inst/db/baseball.sqlite3", "baseball")
#
#' filter(baseball_s, year > 2005, g > 130)
Expand All @@ -16,7 +15,7 @@ NULL

#' @rdname manip_sqlite
#' @export
#' @method filter source
#' @method filter source_sqlite
filter.source_sqlite <- function(.data, ..., .n = 1e5) {
assert_that(length(.n) == 1, .n > 0L)

Expand All @@ -26,7 +25,7 @@ filter.source_sqlite <- function(.data, ..., .n = 1e5) {

#' @rdname manip_sqlite
#' @export
#' @method summarise source
#' @method summarise source_sqlite
summarise.source_sqlite <- function(.data, ..., .n = 1e5) {
assert_that(length(.n) == 1, .n > 0L)

Expand All @@ -36,7 +35,7 @@ summarise.source_sqlite <- function(.data, ..., .n = 1e5) {

#' @rdname manip_sqlite
#' @export
#' @method mutate source
#' @method mutate source_sqlite
mutate.source_sqlite <- function(.data, ..., .n = 1e5) {
assert_that(length(.n) == 1, .n > 0L)

Expand All @@ -46,7 +45,7 @@ mutate.source_sqlite <- function(.data, ..., .n = 1e5) {

#' @rdname manip_sqlite
#' @export
#' @method arrange source
#' @method arrange source_sqlite
arrange.source_sqlite <- function(.data, ..., .n = 1e5) {
assert_that(length(.n) == 1, .n > 0L)

Expand All @@ -56,7 +55,7 @@ arrange.source_sqlite <- function(.data, ..., .n = 1e5) {

#' @rdname manip_sqlite
#' @export
#' @method select source
#' @method select source_sqlite
select.source_sqlite <- function(.data, ..., .n = 1e5) {
assert_that(length(.n) == 1, .n > 0L)

Expand Down
7 changes: 7 additions & 0 deletions R/utils.r
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,10 @@ names2 <- function(x) {
is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) {
abs(x - round(x)) < tol
}

as_df <- function(x) {
class(x) <- "data.frame"
attr(x, "row.names") <- c(NA_integer_, -length(x[[1]]))

x
}
53 changes: 53 additions & 0 deletions bench/df2env.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# What's the fastest way to get a subset of the colums in
# a data frame into an environment?

# list2env is extremely fast so - 20 µs for 200 cols
# and doesn't seem to grow with underlying data
# so don't worry about it - just use list2env

# Just copy them all
l1 <- function(df, cols) {
list2env(df, parent = emptyenv())
}

# Subset, then copy
l2 <- function(df, cols) {
list2env(df[cols], parent = emptyenv())
}

# Copy by hand
l3 <- function(df, cols) {
env <- new.env(parent = emptyenv())
for (col in cols) {
env[[col]] <- .subset2(df, col)
}
env
}


library(microbenchmark)
options(digits = 3)
data(baseball, package = "plyr")

all <- names(baseball)
print(microbenchmark(
l1(baseball, all),
l2(baseball, all),
l3(baseball, all)
))

long <- do.call("rbind", replicate(10, baseball, simplify = FALSE))
print(microbenchmark(
l1(long, all),
l3(long, all),
l1(long, c("id", "sf")),
l3(long, c("id", "sf"))
))

wide <- do.call("cbind", replicate(10, baseball, simplify = FALSE))
print(microbenchmark(
l1(wide, all),
l3(wide, all),
l1(wide, c("id", "sf")),
l3(wide, c("id", "sf"))
))
20 changes: 20 additions & 0 deletions man/desc.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
\name{desc}
\alias{desc}
\title{Descending order.}
\usage{
desc(x)
}
\arguments{
\item{x}{vector to transform}
}
\description{
Transform a vector into a format that will be sorted in
descending order.
}
\examples{
desc(1:10)
desc(factor(letters))
first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years")
desc(first_day)
}

31 changes: 31 additions & 0 deletions man/manip_data_frame.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
\name{manip_data_frame}
\alias{arrange.data.frame}
\alias{filter.data.frame}
\alias{manip_data_frame}
\alias{mutate.data.frame}
\alias{select.data.frame}
\alias{summarise.data.frame}
\title{Data manipulation for data frames.}
\usage{
\method{filter}{data.frame} (.data, ...)

\method{summarise}{data.frame} (.data, ...)

\method{mutate}{data.frame} (.data, ...)

\method{arrange}{data.frame} (.data, ...)

\method{select}{data.frame} (.data, ...)
}
\description{
Data manipulation for data frames.
}
\examples{
data("baseball", package = "plyr")
filter(baseball, year > 2005, g > 130)
head(select(baseball, id:team))
summarise(baseball, g = mean(g), n = count())
head(mutate(baseball, rbi = 1.0 * r / ab))
head(arrange(baseball, id, desc(year)))
}

12 changes: 6 additions & 6 deletions man/manip_sqlite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,22 @@
\alias{summarise.source_sqlite}
\title{Data manipulation for SQL data sources.}
\usage{
\method{filter}{source} (.data, ..., .n = 1e+05)
\method{filter}{source_sqlite} (.data, ..., .n = 1e+05)

\method{summarise}{source} (.data, ..., .n = 1e+05)
\method{summarise}{source_sqlite} (.data, ...,
.n = 1e+05)

\method{mutate}{source} (.data, ..., .n = 1e+05)
\method{mutate}{source_sqlite} (.data, ..., .n = 1e+05)

\method{arrange}{source} (.data, ..., .n = 1e+05)
\method{arrange}{source_sqlite} (.data, ..., .n = 1e+05)

\method{select}{source} (.data, ..., .n = 1e+05)
\method{select}{source_sqlite} (.data, ..., .n = 1e+05)
}
\description{
Data manipulation for SQL data sources.
}
\examples{
data("baseball", package = "plyr")
bdf <- data_frame_source(baseball)
baseball_s <- sqlite_source("inst/db/baseball.sqlite3", "baseball")
filter(baseball_s, year > 2005, g > 130)
head(select(baseball_s, id:team))
Expand Down

0 comments on commit 174e725

Please sign in to comment.