Manip for data frames

kvpradap · May 14, 2013 · 174e725 · 174e725
1 parent 686e530
commit 174e725
Show file tree

Hide file tree

Showing 10 changed files with 227 additions and 12 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -36,3 +36,5 @@ Collate:
     'manip-source.r'
     'manip.r'
     'manip-sqlite.r'
+    'desc.r'
+    'manip-data-frame.r'
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,21 +1,32 @@
+S3method(arrange,data.frame)
 S3method(arrange,source)
+S3method(arrange,source_sqlite)
 S3method(as.data.frame,source_sqlite)
 S3method(as.data.frame,standard_ops)
 S3method(dim,source_sqlite)
 S3method(dimnames,source_sqlite)
+S3method(filter,data.frame)
 S3method(filter,source)
+S3method(filter,source_sqlite)
 S3method(group,"function")
 S3method(group,default)
 S3method(group,source)
 S3method(head,source_sqlite)
+S3method(mutate,data.frame)
 S3method(mutate,source)
+S3method(mutate,source_sqlite)
 S3method(names,source_sqlite)
+S3method(select,data.frame)
 S3method(select,source)
+S3method(select,source_sqlite)
 S3method(source_name,source_sqlite)
 S3method(source_vars,source_sqlite)
+S3method(summarise,data.frame)
 S3method(summarise,source)
+S3method(summarise,source_sqlite)
 S3method(tail,source_sqlite)
 export(arrange)
+export(desc)
 export(filter)
 export(group)
 export(mutate)

diff --git a/R/desc.r b/R/desc.r
@@ -0,0 +1,12 @@
+#' Descending order.
+#'
+#' Transform a vector into a format that will be sorted in descending order.
+#'
+#' @param x vector to transform
+#' @export
+#' @examples
+#' desc(1:10)
+#' desc(factor(letters))
+#' first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years")
+#' desc(first_day)
+desc <- function(x) -xtfrm(x)
diff --git a/R/manip-data-frame.r b/R/manip-data-frame.r
@@ -0,0 +1,80 @@
+#' Data manipulation for data frames.
+#'
+#' @examples
+#' data("baseball", package = "plyr")
+#
+#' filter(baseball, year > 2005, g > 130)
+#' head(select(baseball, id:team))
+#' summarise(baseball, g = mean(g), n = count())
+#' head(mutate(baseball, rbi = 1.0 * r / ab))
+#' head(arrange(baseball, id, desc(year)))
+#'
+#' @name manip_data_frame
+NULL
+
+#' @rdname manip_data_frame
+#' @export
+#' @method filter data.frame
+filter.data.frame <- function(.data, ...) {
+  conds <- dots(...)
+
+  r <- vapply(conds, eval, env = .data, enclos = parent.frame(),
+    FUN.VALUE = logical(nrow(.data)))
+
+  all <- rowSums(r, na.rm = TRUE) == ncol(r)
+  .data[all, , drop = FALSE]
+}
+
+#' @rdname manip_data_frame
+#' @export
+#' @method summarise data.frame
+summarise.data.frame <- function(.data, ...) {
+  cols <- named_dots(...)
+  data_env <- list2env(.data, parent = parent.frame())
+  data_env$count <- function() nrow(.data)
+
+  for (col in names(cols)) {
+    data_env[[col]] <- eval(cols[[col]], data_env)
+  }
+
+  as_df(mget(names(cols), data_env))
+}
+
+#' @rdname manip_data_frame
+#' @export
+#' @method mutate data.frame
+mutate.data.frame <- function(.data, ...) {
+  cols <- named_dots(...)
+  data_env <- list2env(.data, parent = parent.frame())
+
+  for(col in names(cols)) {
+    data_env[[col]] <- eval(cols[[col]], data_env)
+  }
+
+  out_cols <- union(names(.data), names(cols))
+  as_df(mget(out_cols, data_env))
+}
+
+#' @rdname manip_data_frame
+#' @export
+#' @method arrange data.frame
+arrange.data.frame <- function(.data, ...) {
+  r <- eval(substitute(order(...)), .data, parent.frame())
+  if(length(r) != nrow(.data)) {
+    stop("Ordering vectors not the same length as data", call. = FALSE)
+  }
+  .data[r, , drop = FALSE]
+}
+
+#' @rdname manip_data_frame
+#' @export
+#' @method select data.frame
+select.data.frame <- function(.data, ...) {
+  nm <- names(.data)
+  nm_env <- as.list(setNames(seq_along(nm), nm))
+
+  idx <- unlist(lapply(dots(...), eval, nm_env, parent.frame()))
+  select <- nm[idx]
+
+  .data[, nm[idx], drop = FALSE]
+}
diff --git a/R/manip-sqlite.r b/R/manip-sqlite.r
@@ -2,7 +2,6 @@
 #'
 #' @examples
 #' data("baseball", package = "plyr")
-#' bdf <- data_frame_source(baseball)
 #' baseball_s <- sqlite_source("inst/db/baseball.sqlite3", "baseball")
 #
 #' filter(baseball_s, year > 2005, g > 130)
@@ -16,7 +15,7 @@ NULL
 
 #' @rdname manip_sqlite
 #' @export
-#' @method filter source
+#' @method filter source_sqlite
 filter.source_sqlite <- function(.data, ..., .n = 1e5) {
   assert_that(length(.n) == 1, .n > 0L)
 
@@ -26,7 +25,7 @@ filter.source_sqlite <- function(.data, ..., .n = 1e5) {
 
 #' @rdname manip_sqlite
 #' @export
-#' @method summarise source
+#' @method summarise source_sqlite
 summarise.source_sqlite <- function(.data, ..., .n = 1e5) {
   assert_that(length(.n) == 1, .n > 0L)
 
@@ -36,7 +35,7 @@ summarise.source_sqlite <- function(.data, ..., .n = 1e5) {
 
 #' @rdname manip_sqlite
 #' @export
-#' @method mutate source
+#' @method mutate source_sqlite
 mutate.source_sqlite <- function(.data, ..., .n = 1e5) {
   assert_that(length(.n) == 1, .n > 0L)
 
@@ -46,7 +45,7 @@ mutate.source_sqlite <- function(.data, ..., .n = 1e5) {
 
 #' @rdname manip_sqlite
 #' @export
-#' @method arrange source
+#' @method arrange source_sqlite
 arrange.source_sqlite <- function(.data, ..., .n = 1e5) {
   assert_that(length(.n) == 1, .n > 0L)
 
@@ -56,7 +55,7 @@ arrange.source_sqlite <- function(.data, ..., .n = 1e5) {
 
 #' @rdname manip_sqlite
 #' @export
-#' @method select source
+#' @method select source_sqlite
 select.source_sqlite <- function(.data, ..., .n = 1e5) {
   assert_that(length(.n) == 1, .n > 0L)
 

diff --git a/R/utils.r b/R/utils.r
@@ -26,3 +26,10 @@ names2 <- function(x) {
 is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) {
   abs(x - round(x)) < tol
 }
+
+as_df <- function(x) {
+  class(x) <- "data.frame"
+  attr(x, "row.names") <- c(NA_integer_, -length(x[[1]]))
+
+  x
+}
diff --git a/bench/df2env.r b/bench/df2env.r
@@ -0,0 +1,53 @@
+# What's the fastest way to get a subset of the colums in
+# a data frame into an environment?
+
+# list2env is extremely fast so - 20 µs for 200 cols
+# and doesn't seem to grow with underlying data
+# so don't worry about it - just use list2env
+
+# Just copy them all
+l1 <- function(df, cols) {
+  list2env(df, parent = emptyenv())
+}
+
+# Subset, then copy
+l2 <- function(df, cols) {
+  list2env(df[cols], parent = emptyenv())
+}
+
+# Copy by hand
+l3 <- function(df, cols) {
+  env <- new.env(parent = emptyenv())
+  for (col in cols) {
+    env[[col]] <- .subset2(df, col)
+  }
+  env
+}
+
+
+library(microbenchmark)
+options(digits = 3)
+data(baseball, package = "plyr")
+
+all <- names(baseball)
+print(microbenchmark(
+  l1(baseball, all),
+  l2(baseball, all),
+  l3(baseball, all)
+))
+
+long <- do.call("rbind", replicate(10, baseball, simplify = FALSE))
+print(microbenchmark(
+  l1(long, all),
+  l3(long, all),
+  l1(long, c("id", "sf")),
+  l3(long, c("id", "sf"))
+))
+
+wide <- do.call("cbind", replicate(10, baseball, simplify = FALSE))
+print(microbenchmark(
+  l1(wide, all),
+  l3(wide, all),
+  l1(wide, c("id", "sf")),
+  l3(wide, c("id", "sf"))
+))
diff --git a/man/desc.Rd b/man/desc.Rd
@@ -0,0 +1,20 @@
+\name{desc}
+\alias{desc}
+\title{Descending order.}
+\usage{
+  desc(x)
+}
+\arguments{
+  \item{x}{vector to transform}
+}
+\description{
+  Transform a vector into a format that will be sorted in
+  descending order.
+}
+\examples{
+desc(1:10)
+desc(factor(letters))
+first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years")
+desc(first_day)
+}
+
diff --git a/man/manip_data_frame.Rd b/man/manip_data_frame.Rd
@@ -0,0 +1,31 @@
+\name{manip_data_frame}
+\alias{arrange.data.frame}
+\alias{filter.data.frame}
+\alias{manip_data_frame}
+\alias{mutate.data.frame}
+\alias{select.data.frame}
+\alias{summarise.data.frame}
+\title{Data manipulation for data frames.}
+\usage{
+  \method{filter}{data.frame} (.data, ...)
+
+  \method{summarise}{data.frame} (.data, ...)
+
+  \method{mutate}{data.frame} (.data, ...)
+
+  \method{arrange}{data.frame} (.data, ...)
+
+  \method{select}{data.frame} (.data, ...)
+}
+\description{
+  Data manipulation for data frames.
+}
+\examples{
+data("baseball", package = "plyr")
+filter(baseball, year > 2005, g > 130)
+head(select(baseball, id:team))
+summarise(baseball, g = mean(g), n = count())
+head(mutate(baseball, rbi = 1.0 * r / ab))
+head(arrange(baseball, id, desc(year)))
+}
+
diff --git a/man/manip_sqlite.Rd b/man/manip_sqlite.Rd
@@ -7,22 +7,22 @@
 \alias{summarise.source_sqlite}
 \title{Data manipulation for SQL data sources.}
 \usage{
-  \method{filter}{source} (.data, ..., .n = 1e+05)
+  \method{filter}{source_sqlite} (.data, ..., .n = 1e+05)
 
-  \method{summarise}{source} (.data, ..., .n = 1e+05)
+  \method{summarise}{source_sqlite} (.data, ...,
+    .n = 1e+05)
 
-  \method{mutate}{source} (.data, ..., .n = 1e+05)
+  \method{mutate}{source_sqlite} (.data, ..., .n = 1e+05)
 
-  \method{arrange}{source} (.data, ..., .n = 1e+05)
+  \method{arrange}{source_sqlite} (.data, ..., .n = 1e+05)
 
-  \method{select}{source} (.data, ..., .n = 1e+05)
+  \method{select}{source_sqlite} (.data, ..., .n = 1e+05)
 }
 \description{
   Data manipulation for SQL data sources.
 }
 \examples{
 data("baseball", package = "plyr")
-bdf <- data_frame_source(baseball)
 baseball_s <- sqlite_source("inst/db/baseball.sqlite3", "baseball")
 filter(baseball_s, year > 2005, g > 130)
 head(select(baseball_s, id:team))