Skip to content

Commit

Permalink
airlines billion GLM test
Browse files Browse the repository at this point in the history
  • Loading branch information
nmadabhushi committed May 19, 2015
1 parent a6592de commit 9cec825
Showing 1 changed file with 58 additions and 0 deletions.
58 changes: 58 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GLM_airlines_billion_xlarge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#----------------------------------------------------------------------
# Purpose: This test exercises HDFS operations from R.
#          It imports one large CSV from HDFS into H2O, checks the row
#          count, and fits a Gaussian GLM on a random training split.
#----------------------------------------------------------------------

# Change into the directory of the path R.utils reports for the "f"
# command-line argument (presumably the script passed via -f — confirm),
# so the relative path to the shared test utilities below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# get_args is not defined in this file; presumably it comes from the
# utilities sourced above. Its first two elements are used as the H2O
# cluster IP and port.
ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
# HDFS name node is taken from the NAME_NODE environment variable
# (Sys.getenv yields "" when it is unset) and echoed for the test log.
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(testthat)
library(h2o)

# heading is not defined in this file (presumably from the sourced utilities).
heading("BEGIN TEST")
# startH2O = FALSE: connect to the cluster at myIP:myPort rather than
# launching a new local instance.
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)

# Path of the dataset on HDFS; combined with the name node further down.
hdfs_data_file = "/datasets/airlinesbillion.csv"

#----------------------------------------------------------------------
# Single file cases.
#----------------------------------------------------------------------

heading("Testing single file importHDFS")

# Row count the imported frame must have for the test to continue.
expected_rows <- 1166952590

# Import the file from HDFS into H2O, timing the call.
url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
# Report wall-clock ("elapsed") time. The first element of system.time() is
# the R process's own user-CPU time, which does not reflect time spent
# waiting on the H2O cluster this script connects to.
# print() is explicit so the line is also shown when the file is source()d.
print(paste("Time it took to parse", parse_time[["elapsed"]]))

# Abort the test if the parsed frame does not have the expected row count.
n <- nrow(data.hex)
print(n)
if (n != expected_rows) {
  stop("nrows is wrong")
}

# Random train/validation split (80/20) of data.hex.
# h2o.runif is called on the whole frame and its result is compared against
# the cutoff to select rows; per the original note, this is the approach to
# use when the number of rows is too large for R to handle.
split_point <- 0.8
row_draw <- h2o.runif(data.hex)
data.valid <- data.hex[row_draw > split_point, ]
data.train <- data.hex[row_draw <= split_point, ]

## Response column.
## NOTE(review): the original comment labels C19 as "Distance"; the column
## names are not visible in this file — confirm against the dataset.
myY <- "C19"
# Alternative kept for reference: use every other column as a predictor.
#myX = setdiff(names(data.hex), c(myY, ""))
myX <- c("C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27", "C28", "C29")

## Build a Gaussian GLM on the training split and time the fit.
## (data.valid is only used by the commented-out IRLSM variant below.)
#glm_irlsm_time <- system.time(data_irlsm.glm <- h2o.glm(x = myX, y = myY, data = data.train, validation=data.valid, family = "gaussian", solver = "IRLSM"))
glm_time <- system.time(
  data.glm <- h2o.glm(x = myX, y = myY, data = data.train, family = "gaussian")
)
# Explicit print() so the model summary and timing are shown even when the
# file is source()d rather than run at top level.
print(data.glm)
# Report wall-clock ("elapsed") time; element 1 of system.time() is only the
# R process's user-CPU time, not the time spent waiting for the model build.
print(paste("Time it took to build GLM ", glm_time[["elapsed"]]))

# PASS_BANNER is not defined in this file (presumably from the sourced
# test utilities); it is the last step of the test.
PASS_BANNER()

0 comments on commit 9cec825

Please sign in to comment.