Skip to content

Latest commit

 

History

History
224 lines (187 loc) · 4.88 KB

grouped_sum_timing.md

File metadata and controls

224 lines (187 loc) · 4.88 KB

grouped_sum_timing.Rmd

Example using FastBaseR.

library("dplyr")
## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
packageVersion("dplyr")
## [1] '0.7.7'
library("data.table")
## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
packageVersion("data.table")
## [1] '1.11.8'
library("microbenchmark")
library("WVPlots")
library("FastBaseR")

f_base_R_split <- function(data) {
  # first sort the data
  order_index <- with(data, 
                      order(x, y, decreasing = TRUE))
  odata <- data[order_index, , drop = FALSE]
  # now split into groups
  data_list <- split(odata, -odata$x)
  # apply the cumsum to each group
  data_list <- lapply(
    data_list,
    function(di) {
      di$running_y_sum <- cumsum(di$y)
      di
    })
  # put the results back to together
  odata <- do.call(rbind, data_list)
  rownames(odata) <- NULL
  odata
}

f_base_R_running <- function(data) {
  # first sort the data
  order_index <- with(data, order(x, y, decreasing = TRUE))
  odata <- data[order_index, , drop = FALSE]
  rownames(odata) <- NULL
  first_indices <- mark_first_in_each_group(odata, "x")
  odata$running_y_sum <- cumsum_g(odata$y, first_indices)
  odata
}


f_data.table <- function(data) {
  data_data.table <- as.data.table(data)
  
  # sort data
  setorderv(data_data.table, c("x", "y"), order = -1L)
  # apply operation in each x-defined group
  data_data.table[ , running_y_sum := cumsum(y), by = "x"]
  
  data_data.table[]
}

f_dplyr <- function(data) {
  data %>%
    arrange(., desc(x), desc(y)) %>%
    group_by(., x) %>%
    mutate(., running_y_sum = cumsum(y)) %>%
    ungroup(.)
}

data <- wrapr::build_frame(
   "x", "y" |
   1  , 1   |
   0  , 0   |
   1  , 0   |
   0  , 1   |
   0  , 0   |
   1  , 1   )

my_check <- function(values) {
  v1 <- data.frame(values[[1]])
  all(vapply(values[-1], 
             function(x) {
               isTRUE(all.equal(v1, data.frame(x)))
             },
             logical(1)))
}

lst <- list( 
  base_R_split = f_base_R_split(data),
  base_R_running = f_base_R_running(data),
  data.table = f_data.table(data),
  dplyr = f_dplyr(data))

print(lst)
## $base_R_split
##   x y running_y_sum
## 1 1 1             1
## 2 1 1             2
## 3 1 0             2
## 4 0 1             1
## 5 0 0             1
## 6 0 0             1
## 
## $base_R_running
##   x y running_y_sum
## 1 1 1             1
## 2 1 1             2
## 3 1 0             2
## 4 0 1             1
## 5 0 0             1
## 6 0 0             1
## 
## $data.table
##    x y running_y_sum
## 1: 1 1             1
## 2: 1 1             2
## 3: 1 0             2
## 4: 0 1             1
## 5: 0 0             1
## 6: 0 0             1
## 
## $dplyr
## # A tibble: 6 x 3
##       x     y running_y_sum
##   <dbl> <dbl>         <dbl>
## 1     1     1             1
## 2     1     1             2
## 3     1     0             2
## 4     0     1             1
## 5     0     0             1
## 6     0     0             1
my_check(lst)
## [1] TRUE
nrow <- 1000000
nsym <- 100000
set.seed(235236)
data <- data.frame(x = sample.int(nsym, nrow, replace = TRUE))
data$y <- rnorm(nrow(data))

lst <- list( 
  base_R_split = f_base_R_split(data),
  base_R_running = f_base_R_running(data),
  data.table = f_data.table(data),
  dplyr = f_dplyr(data))
my_check(lst)
## [1] TRUE
lst <- NULL


timing <- microbenchmark(
  base_R_split = f_base_R_split(data),
  base_R_running = f_base_R_running(data),
  data.table = f_data.table(data),
  dplyr = f_dplyr(data),
  times = 10L
)

print(timing)
## Unit: milliseconds
##            expr        min         lq       mean     median         uq
##    base_R_split 12650.9977 14041.3320 14417.0595 14341.4191 15309.4664
##  base_R_running   400.1429   422.6459   652.3276   510.1481   859.6739
##      data.table   158.5339   163.0760   195.0300   164.5713   184.1905
##           dplyr  2227.6616  2296.3252  2638.2539  2380.4040  2974.8971
##         max neval cld
##  16411.9355    10   c
##   1169.7952    10 a  
##    404.3589    10 a  
##   3653.6679    10  b
tm <- as.data.frame(timing)
tm$seconds <- tm$time/1e+9
tm$method <- factor(tm$expr)
tm$method <- reorder(tm$method, tm$seconds)
ScatterBoxPlotH(tm, 
                "seconds", "method", 
                "task time by method")