Skip to content

Commit

Permalink
first pass at adding xlarge tests for performance parity
Browse files Browse the repository at this point in the history
  • Loading branch information
Sebastian Vidrio authored and Sebastian Vidrio committed May 22, 2015
1 parent 873ac68 commit 3663a6b
Show file tree
Hide file tree
Showing 19 changed files with 914 additions and 1 deletion.
49 changes: 49 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_186KRows_3.2KCols_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a Deep Learning (DL) model
# for 186K rows and 3.2K columns
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

# NOTE(review): this flag is computed but never consulted in this script.
running_inside_hexdata <- file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = TRUE))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

# Prefix column names that parse as numbers with "C"; other names are
# kept as they are. The coercion warning for non-numeric names is
# expected, so it is suppressed for this single call only.
colNames <- names(data.hex)
is_numeric_name <- !is.na(suppressWarnings(as.numeric(colNames)))
colNames[is_numeric_name] <- paste0("C", colNames[is_numeric_name])

# The first column is the response and is always called "C1".
colNames[1] <- "C1"
names(data.hex) <- colNames

myY <- colNames[1]
myX <- setdiff(names(data.hex), myY)

# Start modeling
# Deep Learning
dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY, data=data.hex,
                                                    epochs=.1, hidden=c(5,5)))
paste("Time it took to build DL ", dl_time[["elapsed"]])
data1.dl

PASS_BANNER()
43 changes: 43 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_1MRows_2.2KCols_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a Deep Learning (DL) model
# on the 1M rows x 2.2K columns dataset read from HDFS
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file <- "/datasets/1Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

response <- 1 #1:1000 imbalance
# Predictors start at column 3; column 2 is left out of the model.
predictors <- 3:ncol(data.hex)

# Start modeling
# DL
dl_time <- system.time(mdl.dl <- h2o.deeplearning(x=predictors, y=response,
                                                  data=data.hex, replicate_training_data=FALSE, epochs=.1, hidden=c(5,5)))
mdl.dl
paste("Time it took to build DL ", dl_time[["elapsed"]])

PASS_BANNER()

58 changes: 58 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_airlines_billion_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#----------------------------------------------------------------------
# Purpose: This test exercises HDFS operations from R: it imports the
# airlines-billion dataset from HDFS, checks its row count, and builds
# a Deep Learning model on an 80/20 train/validation split.
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(testthat)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file <- "/datasets/airlinesbillion.csv"

#----------------------------------------------------------------------
# Single file cases.
#----------------------------------------------------------------------

heading("Testing single file importHDFS")
url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

# Sanity-check the parse: the dataset must have exactly this many rows.
n <- nrow(data.hex)
print(n)
if (n != 1166952590) {
  stop("nrows is wrong")
}

# Constructing validation and train sets by sampling (20/80):
# s is a random column as tall as the airlines frame.
s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle
data.train <- data.hex[s <= 0.8,]
data.valid <- data.hex[s > 0.8,]

## Chose which col as response
## Response = IsDepDelayed
myY <- "C31"
myX <- setdiff(names(data.hex), myY)

dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY,
                                                    data=data.train, validation=data.valid, replicate_training_data=FALSE,
                                                    epochs=.1, hidden=c(5,5)))
data1.dl
paste("Time it took to build DL ", dl_time[["elapsed"]])

PASS_BANNER()
42 changes: 42 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_15MRows_2.2KCols_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
# on the 15M rows x 2.2K columns dataset read from HDFS
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file <- "/datasets/15Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

response <- 1 #1:1000 imbalance
# Predictors start at column 3; column 2 is left out of the model.
predictors <- 3:ncol(data.hex)

# Start modeling
# Gradient Boosted Trees
gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response, data=data.hex, distribution = "bernoulli"))
mdl.gbm
paste("Time it took to build GBM ", gbm_time[["elapsed"]])

PASS_BANNER()

49 changes: 49 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_186KRows_3.2KCols_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
# for 186K rows and 3.2K columns
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

# NOTE(review): this flag is computed but never consulted in this script.
running_inside_hexdata <- file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = TRUE))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

# Prefix column names that parse as numbers with "C"; other names are
# kept as they are. The coercion warning for non-numeric names is
# expected, so it is suppressed for this single call only.
colNames <- names(data.hex)
is_numeric_name <- !is.na(suppressWarnings(as.numeric(colNames)))
colNames[is_numeric_name] <- paste0("C", colNames[is_numeric_name])

# The first column is the response and is always called "C1".
colNames[1] <- "C1"
names(data.hex) <- colNames

myY <- colNames[1]
myX <- setdiff(names(data.hex), myY)

# Start modeling
# GBM on original dataset
gbm_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, data = data.hex,
                                             n.trees = 10, interaction.depth = 5, distribution = "multinomial"))
paste("Time it took to build GBM ", gbm_time[["elapsed"]])
data1.gbm

PASS_BANNER()
43 changes: 43 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_1MRows_2.2KCols_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
# on the 1M rows x 2.2K columns dataset read from HDFS
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file <- "/datasets/1Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

response <- 1 #1:1000 imbalance
# Predictors start at column 3; column 2 is left out of the model.
predictors <- 3:ncol(data.hex)

# Start modeling
# Gradient Boosted Trees
gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response,
                                           data=data.hex, distribution = "bernoulli"))
mdl.gbm
paste("Time it took to build GBM ", gbm_time[["elapsed"]])

PASS_BANNER()

46 changes: 46 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_376KRows_6KCols_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
# for 376K rows and 6.9K columns
#----------------------------------------------------------------------

# Switch to the directory holding this script so the relative source()
# below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# IP and port of the H2O instance, taken from the command line.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

# NOTE(review): this flag is computed but never consulted in this script.
running_inside_hexdata <- file.exists("/mnt/0xcustomer-datasets/c28")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

h2o.ls(conn)
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c28/mr_output.tsv.sorted.gz"))
# Report wall-clock ("elapsed") time. The first element of system.time()
# is the user CPU time of this R process, which does not reflect work
# done by the H2O cluster.
paste("Time it took to parse", parse_time[["elapsed"]])

dim(data.hex)

# 80/20 train/validation split driven by a random uniform column.
s <- h2o.runif(data.hex)
train <- data.hex[s <= 0.8,]
valid <- data.hex[s > 0.8,]

# GBM model: column 2 is the response, columns 3..ncol are predictors.
gbm_time <- system.time(model.gbm <- h2o.gbm(x = 3:(ncol(train)), y = 2,
                                             data = train, validation=valid, n.trees=10, interaction.depth=5))
paste("Time it took to build GBM ", gbm_time[["elapsed"]])
model.gbm

# Score the validation set.
# NOTE(review): perf is computed but never printed or asserted on.
pred <- h2o.predict(model.gbm, valid)
perf <- h2o.performance(pred[,3], valid[,2])

PASS_BANNER()
Loading

0 comments on commit 3663a6b

Please sign in to comment.