Commit b88db1e

Merge branch 'master' of https://github.com/h2oai/h2o

spennihana committed Jun 8, 2015
2 parents 93f1f5f + 7ab449c
Showing 34 changed files with 1,116 additions and 49 deletions.
2 changes: 1 addition & 1 deletion R/ensemble/README.md
@@ -13,7 +13,7 @@ R CMD INSTALL h2o/R/ensemble/h2oEnsemble-package
- Install in R using `devtools::install_github`:
```
library(devtools)
install_github("h2oai/h2o/R/ensemble/h2oEnsemble-package")
install_github("h2oai/h2o-2/R/ensemble/h2oEnsemble-package")
```

## Create Ensembles
2 changes: 1 addition & 1 deletion R/h2o-package/R/Algorithms.R
@@ -813,7 +813,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
noGrid <- noGrid && (missing(override_with_best_model) || length(override_with_best_model) == 1)
noGrid <- noGrid && (missing(seed) || length(seed) == 1)
noGrid <- noGrid && (missing(input_dropout_ratio) || length(input_dropout_ratio) == 1)
- noGrid <- noGrid && (missing(hidden_dropout_ratios) || (!is.list(hidden_dropout_ratios) && length(hidden_dropout_ratios) > 1))
+ noGrid <- noGrid && (missing(hidden_dropout_ratios) || !(is.list(hidden_dropout_ratios) && length(hidden_dropout_ratios) > 1))
noGrid <- noGrid && (missing(max_w2) || length(max_w2) == 1)
noGrid <- noGrid && (missing(initial_weight_distribution) || length(initial_weight_distribution) == 1)
noGrid <- noGrid && (missing(initial_weight_scale) || length(initial_weight_scale) == 1)
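The one-character move of `!` above is the whole fix: the old form negated only `is.list(...)`, so a single dropout setting passed as a plain number failed the check and cleared `noGrid`, sending a single-model request down the grid-search path. A minimal sketch of the difference, with illustrative values (the surrounding `noGrid` chain treats `TRUE` as "still a single model"):

```
# Single model: one dropout ratio, passed as a plain numeric value.
x <- 0.5
!is.list(x) && length(x) > 1    # old form: FALSE, so noGrid is wrongly cleared
!(is.list(x) && length(x) > 1)  # new form: TRUE, single-model path is kept

# Grid search: several settings, passed as a list of vectors.
g <- list(c(0.1, 0.2), c(0.3, 0.4))
!(is.list(g) && length(g) > 1)  # FALSE, so a grid is correctly detected
```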
11 changes: 11 additions & 0 deletions R/h2o-package/R/Internal.R
@@ -116,6 +116,7 @@ h2o.setLogPath <- function(path, type) {
.h2o.__SET_DOMAIN = "2/SetDomains.json"
.h2o.__PAGE_ALLMODELS = "2/Models.json"
.h2o.__GAINS <- "2/GainsLiftTable.json"
+ .h2o.__PAGE_GARBAGECOLLECT = "GarbageCollect.json"

.h2o.__PAGE_IMPUTE= "2/Impute.json"
.h2o.__PAGE_EXEC2 = "2/Exec2.json"
@@ -1054,3 +1055,13 @@ h2o.getFrame <- function(h2o, key) {
"gamma" = Gamma(link))
}
}

+ #
+ # This function is internal intentionally.
+ #
+ # Call it as:
+ #   h2o:::.h2o.garbageCollect(localH2O)
+ #
+ .h2o.garbageCollect <- function(client) {
+   res = .h2o.__remoteSend(client, .h2o.__PAGE_GARBAGECOLLECT)
+ }
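Following the usage note in the comment, a minimal sketch of calling the new hook (assumes an H2O 2.x cluster already running on localhost:54321; `:::` is required because the function is deliberately not exported):

```
library(h2o)
localH2O <- h2o.init(ip = "localhost", port = 54321, startH2O = FALSE)
# Sends a request to the new GarbageCollect.json endpoint, which presumably
# asks the cluster's JVMs to run a garbage collection.
h2o:::.h2o.garbageCollect(localH2O)
```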
49 changes: 49 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_186KRows_3.2KCols_xlarge.R
@@ -0,0 +1,49 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a DL model
#          for 186K rows and 3.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T))
paste("Time it took to parse", parse_time[[1]])

colNames <- c()
for(col in names(data.hex)) {
colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col))
colNames = append(colNames, colName)
}

colNames[1] <- "C1"
names(data.hex) <- colNames

myY = colNames[1]
myX = setdiff(names(data.hex), myY)

# Start modeling
#Deep Learning
dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY, data=data.hex,
epochs=.1, hidden=c(5,5)))
paste("Time it took to build DL ", dl_time[[1]])
data1.dl

PASS_BANNER()
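The column-renaming loop in this script prefixes purely numeric column names with `C` and leaves the rest untouched (`as.numeric` returns `NA`, with a warning, for non-numeric strings, which is exactly what the test keys on). A standalone sketch with hypothetical names:

```
nm <- c("1", "age", "2", "income")
suppressWarnings(
  sapply(nm, function(col) if (is.na(as.numeric(col))) col else paste0("C", col))
)
#        1      age        2   income
#     "C1"    "age"     "C2" "income"
```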
43 changes: 43 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_1MRows_2.2KCols_xlarge.R
@@ -0,0 +1,43 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a DL model
#          on 1M rows and 2.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/1Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

response=1 #1:1000 imbalance
predictors=c(3:ncol(data.hex))

# Start modeling
# DL
dl_time <- system.time(mdl.dl <- h2o.deeplearning(x=predictors, y=response,
data=data.hex, replicate_training_data=FALSE, epochs=.1, hidden=c(5,5)))
mdl.dl
paste("Time it took to build DL ", dl_time[[1]])

PASS_BANNER()

58 changes: 58 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_airlines_billion_xlarge.R
@@ -0,0 +1,58 @@
#----------------------------------------------------------------------
# Purpose: This test exercises HDFS operations from R.
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(testthat)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/airlinesbillion.csv"

#----------------------------------------------------------------------
# Single file cases.
#----------------------------------------------------------------------

heading("Testing single file importHDFS")
url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

data1.hex <- data.hex

n <- nrow(data.hex)
print(n)
if (n != 1166952590) {
stop("nrows is wrong")
}

# Construct train and validation sets by sampling (80/20),
# creating a random column as tall as the data (nrow(data.hex))
s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle
data.train <- data.hex[s <= 0.8,]
data.valid <- data.hex[s > 0.8,]

## Choose which col as response
## Response = IsDepDelayed
myY = "C31"
myX = setdiff(names(data1.hex), myY)

dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY,
data=data.train, validation=data.valid, replicate_training_data=FALSE,
epochs=.1, hidden=c(5,5)))
data1.dl
paste("Time it took to build DL ", dl_time[[1]])

PASS_BANNER()
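The 80/20 split in this script is the standard H2O idiom for data too large for R: `h2o.runif` materializes the uniform column on the cluster, so the billion-row frame never enters R's memory, and the two slices are complementary cuts of the same draw. Condensed:

```
s <- h2o.runif(data.hex)             # uniform [0,1) column, computed cluster-side
data.train <- data.hex[s <= 0.8, ]   # roughly 80% of rows
data.valid <- data.hex[s >  0.8, ]   # the remaining ~20%, disjoint by construction
```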
42 changes: 42 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_15MRows_2.2KCols_xlarge.R
@@ -0,0 +1,42 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          on 15M rows and 2.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/15Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

response=1 #1:1000 imbalance
predictors=c(3:ncol(data.hex))

# Start modeling
# Gradient Boosted Trees
gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response, data=data.hex, distribution = "bernoulli"))
mdl.gbm
paste("Time it took to build GBM ", gbm_time[[1]])

PASS_BANNER()

49 changes: 49 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_186KRows_3.2KCols_xlarge.R
@@ -0,0 +1,49 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          for 186K rows and 3.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T))
paste("Time it took to parse", parse_time[[1]])

colNames <- c()
for(col in names(data.hex)) {
colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col))
colNames = append(colNames, colName)
}

colNames[1] <- "C1"
names(data.hex) <- colNames

myY = colNames[1]
myX = setdiff(names(data.hex), myY)

# Start modeling
#GBM on original dataset
gbm_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, data = data.hex,
n.trees = 10, interaction.depth = 5, distribution = "multinomial"))
paste("Time it took to build GBM ", gbm_time[[1]])
data1.gbm

PASS_BANNER()
43 changes: 43 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_1MRows_2.2KCols_xlarge.R
@@ -0,0 +1,43 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          on 1M rows and 2.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/1Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

response=1 #1:1000 imbalance
predictors=c(3:ncol(data.hex))

# Start modeling
# Gradient Boosted Trees
gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response,
data=data.hex, distribution = "bernoulli"))
mdl.gbm
paste("Time it took to build GBM ", gbm_time[[1]])

PASS_BANNER()

46 changes: 46 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_376KRows_6KCols_xlarge.R
@@ -0,0 +1,46 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          for 376K rows and 6.9K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c28")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

h2o.ls(conn)
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c28/mr_output.tsv.sorted.gz"))
paste("Time it took to parse", parse_time[[1]])

dim(data.hex)

s = h2o.runif(data.hex)
train = data.hex[s <= 0.8,]
valid = data.hex[s > 0.8,]

#GBM model
gbm_time <- system.time(model.gbm <- h2o.gbm(x = 3:(ncol(train)), y = 2,
data = train, validation=valid, n.trees=10, interaction.depth=5))
paste("Time it took to build GBM ", gbm_time[[1]])
model.gbm

pred = h2o.predict(model.gbm, valid)
perf <- h2o.performance(pred[,3], valid[,2])

PASS_BANNER()
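The scoring step at the end computes performance by hand from the prediction frame. In the h2o-2 R API a binomial prediction frame carries the predicted label plus one probability column per class, so `pred[,3]` is taken here as the positive-class probability; that column layout is an assumption worth double-checking against the actual frame. Annotated:

```
pred <- h2o.predict(model.gbm, valid)   # assumed columns: predict, p(0), p(1)
perf <- h2o.performance(pred[, 3],      # treated as P(positive class)
                        valid[, 2])     # actual response, the same column used to train
print(perf)
```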