diff --git a/R/ensemble/README.md b/R/ensemble/README.md index ce9e3c4292..66b66f1dd2 100644 --- a/R/ensemble/README.md +++ b/R/ensemble/README.md @@ -13,7 +13,7 @@ R CMD INSTALL h2o/R/ensemble/h2oEnsemble-package - Install in R using `devtools::install_github`: ``` library(devtools) -install_github("h2oai/h2o/R/ensemble/h2oEnsemble-package") +install_github("h2oai/h2o-2/R/ensemble/h2oEnsemble-package") ``` ## Create Ensembles diff --git a/R/h2o-package/R/Algorithms.R b/R/h2o-package/R/Algorithms.R index 90ff5df6e2..fe8e7dc88e 100755 --- a/R/h2o-package/R/Algorithms.R +++ b/R/h2o-package/R/Algorithms.R @@ -813,7 +813,7 @@ h2o.deeplearning <- function(x, y, data, key = "", noGrid <- noGrid && (missing(override_with_best_model) || length(override_with_best_model) == 1) noGrid <- noGrid && (missing(seed) || length(seed) == 1) noGrid <- noGrid && (missing(input_dropout_ratio) || length(input_dropout_ratio) == 1) - noGrid <- noGrid && (missing(hidden_dropout_ratios) || (!is.list(hidden_dropout_ratios) && length(hidden_dropout_ratios) > 1)) + noGrid <- noGrid && (missing(hidden_dropout_ratios) || !(is.list(hidden_dropout_ratios) && length(hidden_dropout_ratios) > 1)) noGrid <- noGrid && (missing(max_w2) || length(max_w2) == 1) noGrid <- noGrid && (missing(initial_weight_distribution) || length(initial_weight_distribution) == 1) noGrid <- noGrid && (missing(initial_weight_scale) || length(initial_weight_scale) == 1) diff --git a/R/h2o-package/R/Internal.R b/R/h2o-package/R/Internal.R index b181ada3a1..53cca282f0 100644 --- a/R/h2o-package/R/Internal.R +++ b/R/h2o-package/R/Internal.R @@ -116,6 +116,7 @@ h2o.setLogPath <- function(path, type) { .h2o.__SET_DOMAIN = "2/SetDomains.json" .h2o.__PAGE_ALLMODELS = "2/Models.json" .h2o.__GAINS <- "2/GainsLiftTable.json" +.h2o.__PAGE_GARBAGECOLLECT = "GarbageCollect.json" .h2o.__PAGE_IMPUTE= "2/Impute.json" .h2o.__PAGE_EXEC2 = "2/Exec2.json" @@ -1054,3 +1055,13 @@ h2o.getFrame <- function(h2o, key) { "gamma" = Gamma(link)) } } + +# +# This function is internal intentionally. +# +# Call it as: +# h2o:::.h2o.garbageCollect(localH2O) +# +.h2o.garbageCollect <- function(client) { + res = .h2o.__remoteSend(client, .h2o.__PAGE_GARBAGECOLLECT) +} diff --git a/R/tests/testdir_hdfs_xlarge/runit_DL_186KRows_3.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_DL_186KRows_3.2KCols_xlarge.R new file mode 100644 index 0000000000..5da9f920c5 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_DL_186KRows_3.2KCols_xlarge.R @@ -0,0 +1,49 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises building GLM/GBM/DL model +# for 186K rows and 3.2K columns +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv") + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- +parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T)) +paste("Time it took to parse", parse_time[[1]]) + +colNames = {} +for(col in names(data.hex)) { + colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col)) + colNames = append(colNames, colName) +} + +colNames[1] <- "C1" +names(data.hex) <- colNames + +myY = colNames[1] +myX = setdiff(names(data.hex), myY) + +# Start modeling +#Deep Learning +dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY, data=data.hex, + epochs=.1, hidden=c(5,5))) +paste("Time it took to build DL ", dl_time[[1]]) +data1.dl + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_DL_1MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_DL_1MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..77ed2d7b00 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_DL_1MRows_2.2KCols_xlarge.R @@ -0,0 +1,43 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/1Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +# DL +dl_time <- system.time(mdl.dl <- h2o.deeplearning(x=predictors, y=response, + data=data.hex, replicate_training_data=FALSE, epochs=.1, hidden=c(5,5))) +mdl.dl +paste("Time it took to build DL ", dl_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_DL_airlines_billion_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_DL_airlines_billion_xlarge.R new file mode 100644 index 0000000000..e657e5a045 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_DL_airlines_billion_xlarge.R @@ -0,0 +1,58 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/airlinesbillion.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +data1.hex <- data.hex + +n <- nrow(data.hex) +print(n) +if (n != 1166952590) { + stop("nrows is wrong") +} + +#Constructing validation and train sets by sampling (20/80) +#creating a column as tall as airlines(nrow(air)) +s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle +data.train <- data.hex[s <= 0.8,] +data.valid <- data.hex[s > 0.8,] + +## Chose which col as response +## Response = IsDepDelayed +myY = "C31" +myX = setdiff(names(data1.hex), myY) + +dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY, + data=data.train, validation=data.valid, replicate_training_data=FALSE, + epochs=.1, hidden=c(5,5))) +data1.dl +paste("Time it took to build DL ", dl_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_15MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_15MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..229a0a2a81 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_15MRows_2.2KCols_xlarge.R @@ -0,0 +1,42 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/15Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +# Gradient Boosted Trees +gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response, data=data.hex, distribution = "bernoulli")) +mdl.gbm +paste("Time it took to build GBM ", gbm_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_186KRows_3.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_186KRows_3.2KCols_xlarge.R new file mode 100644 index 0000000000..45efbd64b7 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_186KRows_3.2KCols_xlarge.R @@ -0,0 +1,49 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises building GLM/GBM/DL model +# for 186K rows and 3.2K columns +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv") + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- +parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T)) +paste("Time it took to parse", parse_time[[1]]) + +colNames = {} +for(col in names(data.hex)) { + colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col)) + colNames = append(colNames, colName) +} + +colNames[1] <- "C1" +names(data.hex) <- colNames + +myY = colNames[1] +myX = setdiff(names(data.hex), myY) + +# Start modeling +#GBM on original dataset +gbm_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, data = data.hex, + n.trees = 10, interaction.depth = 5, distribution = "multinomial")) +paste("Time it took to build GBM ", gbm_time[[1]]) +data1.gbm + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_1MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_1MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..a78e9e8fb3 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_1MRows_2.2KCols_xlarge.R @@ -0,0 +1,43 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/1Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +# Gradient Boosted Trees +gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response, + data=data.hex, distribution = "bernoulli")) +mdl.gbm +paste("Time it took to build GBM ", gbm_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_376KRows_6KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_376KRows_6KCols_xlarge.R new file mode 100644 index 0000000000..8323044261 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_376KRows_6KCols_xlarge.R @@ -0,0 +1,46 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises building GLM/GBM/DL model +# for 376K rows and 6.9K columns +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c28") + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +h2o.ls(conn) +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- +parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c28/mr_output.tsv.sorted.gz")) +paste("Time it took to parse", parse_time[[1]]) + +dim(data.hex) + +s = h2o.runif(data.hex) +train = data.hex[s <= 0.8,] +valid = data.hex[s > 0.8,] + +#GBM model +gbm_time <- system.time(model.gbm <- h2o.gbm(x = 3:(ncol(train)), y = 2, + data = train, validation=valid, n.trees=10, interaction.depth=5)) +paste("Time it took to build GBM ", gbm_time[[1]]) +model.gbm + +pred = h2o.predict(model.gbm, valid) +perf <- h2o.performance(pred[,3], valid[,2]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_AUTO_airlines_billion_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_AUTO_airlines_billion_xlarge.R new file mode 100644 index 0000000000..5348dc974e --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_AUTO_airlines_billion_xlarge.R @@ -0,0 +1,63 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/airlinesbillion.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +data1.hex <- data.hex + +n <- nrow(data.hex) +print(n) +if (n != 1166952590) { + stop("nrows is wrong") +} + +#Constructing validation and train sets by sampling (20/80) +#creating a column as tall as airlines(nrow(air)) +s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle +data.train <- data.hex[s <= 0.8,] +data.valid <- data.hex[s > 0.8,] + +## Chose which col as response +## Response = IsDepDelayed +myY = "C31" +myX = setdiff(names(data1.hex), myY) +gbm_10tree_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, + data = data.train, validation=data.valid, n.trees = 10, interaction.depth = 5, + distribution = "AUTO")) +data1.gbm +paste("Time it took to build GBM ", gbm_10tree_time[[1]]) + +gbm_50tree_time <- system.time(data2.gbm <- h2o.gbm(x = myX, y = myY, + data = data.train, validation=data.valid, n.trees = 50, interaction.depth = 5, + distribution = "AUTO")) +data2.gbm +paste("Time it took to build GBM ", gbm_50tree_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_Bernoulli_airlines_billion_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_Bernoulli_airlines_billion_xlarge.R new file mode 100644 index 0000000000..543d91e75c --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_Bernoulli_airlines_billion_xlarge.R @@ -0,0 +1,57 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/airlinesbillion.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +data1.hex <- data.hex + +n <- nrow(data.hex) +print(n) +if (n != 1166952590) { + stop("nrows is wrong") +} + +#Constructing validation and train sets by sampling (20/80) +#creating a column as tall as airlines(nrow(air)) +s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle +data.train <- data.hex[s <= 0.8,] +data.valid <- data.hex[s > 0.8,] + +## Chose which col as response +## Response = IsDepDelayed +myY = "C31" +myX = setdiff(names(data1.hex), myY) +gbm_10tree_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, + data = data.train, validation=data.valid, n.trees = 10, interaction.depth = 5, + distribution = "bernoulli")) +data1.gbm +paste("Time it took to build GBM ", gbm_10tree_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GBM_Multinomial_airlines_billion_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GBM_Multinomial_airlines_billion_xlarge.R new file mode 100644 index 0000000000..07dfe6af4e --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GBM_Multinomial_airlines_billion_xlarge.R @@ -0,0 +1,57 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/airlinesbillion.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +data1.hex <- data.hex + +n <- nrow(data.hex) +print(n) +if (n != 1166952590) { + stop("nrows is wrong") +} + +#Constructing validation and train sets by sampling (20/80) +#creating a column as tall as airlines(nrow(air)) +s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle +data.train <- data.hex[s <= 0.8,] +data.valid <- data.hex[s > 0.8,] + +## Chose which col as response +## Response = IsDepDelayed +myY = "C31" +myX = setdiff(names(data1.hex), myY) +gbm_10tree_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, + data = data.train, validation=data.valid, n.trees = 10, interaction.depth = 5, + distribution = "multinomial")) +data1.gbm +paste("Time it took to build GBM ", gbm_10tree_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GLM_15MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GLM_15MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..e0f64671cd --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GLM_15MRows_2.2KCols_xlarge.R @@ -0,0 +1,43 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/15Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +# GLM +glm_time <- system.time(mdl.glm <- h2o.glm(x=predictors, y=response, + data=data.hex, family = "binomial")) +mdl.glm +paste("Time it took to build GLM ", glm_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_GLM_186KRows_3.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GLM_186KRows_3.2KCols_xlarge.R new file mode 100644 index 0000000000..0894b12bc4 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GLM_186KRows_3.2KCols_xlarge.R @@ -0,0 +1,49 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises building GLM/GBM/DL model +# for 186K rows and 3.2K columns +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv") + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- +parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T)) +paste("Time it took to parse", parse_time[[1]]) + +colNames = {} +for(col in names(data.hex)) { + colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col)) + colNames = append(colNames, colName) +} + +colNames[1] <- "C1" +names(data.hex) <- colNames + +myY = colNames[1] +myX = setdiff(names(data.hex), myY) + +# Start modeling +# GLM +glm_time <- system.time(data1.glm <- h2o.glm(x=myX, y=myY, data = data.hex, + family="gaussian")) +data1.glm +paste("Time it took to build GLM ", glm_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GLM_1MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GLM_1MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..5f78147909 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GLM_1MRows_2.2KCols_xlarge.R @@ -0,0 +1,43 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/1Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +# GLM +glm_time <- system.time(mdl.glm <- h2o.glm(x=predictors, y=response, + data=data.hex, family = "binomial")) +mdl.glm +paste("Time it took to build GLM ", glm_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_GLM_376KRows_6KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GLM_376KRows_6KCols_xlarge.R new file mode 100644 index 0000000000..1d67c17996 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GLM_376KRows_6KCols_xlarge.R @@ -0,0 +1,46 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises building GLM/GBM/DL model +# for 376K rows and 6.9K columns +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c28") + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +h2o.ls(conn) +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- +parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c28/mr_output.tsv.sorted.gz")) +paste("Time it took to parse", parse_time[[1]]) + +dim(data.hex) + +s = h2o.runif(data.hex) +train = data.hex[s <= 0.8,] +valid = data.hex[s > 0.8,] + +#GLM Model +glm_time <- system.time(model.glm <- h2o.glm(x = 3:(ncol(train)), y = 6, + data = train, validation=valid, family = "binomial")) +paste("Time it took to build GLM ", glm_time[[1]]) +model.glm + +pred = h2o.predict(model.glm, valid) +perf <- h2o.performance(pred[,3], valid[,6]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_GLM_airlines_billion_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_GLM_airlines_billion_xlarge.R new file mode 100644 index 0000000000..a9ab8bb7b7 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_GLM_airlines_billion_xlarge.R @@ -0,0 +1,58 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) + +hdfs_data_file = "/datasets/airlinesbillion.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +data1.hex <- data.hex + +n <- nrow(data.hex) +print(n) +if (n != 1166952590) { + stop("nrows is wrong") +} + +#Constructing validation and train sets by sampling (20/80) +#creating a column as tall as airlines(nrow(air)) +s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle +data.train <- data.hex[s <= 0.8,] +data.valid <- data.hex[s > 0.8,] + +## Response = Distance + +myY = "C19" +#myX = setdiff(names(data.hex), c(myY, "")) +myX = c("C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27", "C28", "C29") +## Build GLM Model and compare AUC with h2o1 + +#glm_irlsm_time <- system.time(data_irlsm.glm <- h2o.glm(x = myX, y = myY, data = data.train, validation=data.valid, family = "gaussian", solver = "IRLSM")) +glm_time <- system.time(data.glm <- h2o.glm(x = myX, y = myY, data = data.train, family = "gaussian")) +data.glm +paste("Time it took to build GLM ", glm_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_RF_15MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_RF_15MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..831264bb67 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_RF_15MRows_2.2KCols_xlarge.R @@ -0,0 +1,43 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/15Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +#Random Forest +rf_time <- system.time(mdl.rf <- h2o.randomForest(x=predictors, y=response, + data=data.hex, ntree=10, depth=5)) +mdl.rf +paste("Time it took to build RF ", rf_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_RF_1MRows_2.2KCols_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_RF_1MRows_2.2KCols_xlarge.R new file mode 100644 index 0000000000..e683dac351 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_RF_1MRows_2.2KCols_xlarge.R @@ -0,0 +1,43 @@ + +#---------------------------------------------------------------------- +# Purpose: This test exercises building 15MRows2KCols +# +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/1Mx2.2k.csv" +#---------------------------------------------------------------------- +# Parameters for the test. +#---------------------------------------------------------------------- + +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +response=1 #1:1000 imbalance +predictors=c(3:ncol(data.hex)) + +# Start modeling +#Random Forest +rf_time <- system.time(mdl.rf <- h2o.randomForest(x=predictors, y=response, + data=data.hex, n.tree=10, interaction.depth=5)) +mdl.rf +paste("Time it took to build RF ", rf_time[[1]]) + +PASS_BANNER() + diff --git a/R/tests/testdir_hdfs_xlarge/runit_RF_airlines_billion_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_RF_airlines_billion_xlarge.R new file mode 100644 index 0000000000..4d9dce9be2 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_RF_airlines_billion_xlarge.R @@ -0,0 +1,59 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_data_file = "/datasets/airlinesbillion.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file) +parse_time <- system.time(data.hex <- h2o.importFile(conn, url)) +paste("Time it took to parse", parse_time[[1]]) + +data1.hex <- data.hex + +n <- nrow(data.hex) +print(n) +if (n != 1166952590) { + stop("nrows is wrong") +} + +#Constructing validation and train sets by sampling (20/80) +#creating a column as tall as airlines(nrow(air)) +s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle +data.train <- data.hex[s <= 0.8,] +data.valid <- data.hex[s > 0.8,] + +## Chose which col as response +## Response = IsDepDelayed +myY = "C31" +# myX = setdiff(names(data1.hex), myY) +myX = c("C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27", "C28", "C29") + +rf_time <- system.time(data1.rf <- h2o.randomForest(x = myX, y = myY, + data = data.train, validation=data.valid, ntree = 10, depth = 5, + type = "BigData")) +data1.rf +paste("Time it took to build RF ", rf_time[[1]]) + +PASS_BANNER() diff --git a/R/tests/testdir_hdfs_xlarge/runit_hadoop_airlines_xlarge.R b/R/tests/testdir_hdfs_xlarge/runit_hadoop_airlines_xlarge.R new file mode 100644 index 0000000000..6e882d3ee8 --- /dev/null +++ b/R/tests/testdir_hdfs_xlarge/runit_hadoop_airlines_xlarge.R @@ -0,0 +1,80 @@ +#---------------------------------------------------------------------- +# Purpose: This test exercises HDFS operations from R. +#---------------------------------------------------------------------- + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +ipPort <- get_args(commandArgs(trailingOnly = TRUE)) +myIP <- ipPort[[1]] +myPort <- ipPort[[2]] +hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +library(RCurl) +library(testthat) +library(h2o) + +heading("BEGIN TEST") +conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE) +h2o.removeAll() + +hdfs_airlines_file = "/datasets/airlines_all.csv" + +#---------------------------------------------------------------------- +# Single file cases. +#---------------------------------------------------------------------- + +heading("Testing single file importHDFS") +url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_airlines_file) +data.hex <- h2o.importFile(conn, url) + +n <- nrow(data.hex) +print(n) +if (n != 116695259) { + stop("nrows is wrong") +} + +if (class(data.hex) != "H2OFrame") { + stop("data.hex is the wrong type") +} +print ("Import worked") + +## First choose columns to ignore +IgnoreCols <- c('DepTime','ArrTime','FlightNum','TailNum','ActualElapsedTime','AirTime','ArrDelay','DepDelay','TaxiIn','TaxiOut','Cancelled','CancellationCode','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay','Diverted') + +## Then remove those cols from validX list +myX <- which(!(names(data.hex) %in% IgnoreCols)) + +## Chose which col as response +DepY <- "IsDepDelayed" + +# Chose functions glm, gbm, deeplearning +# obj name | function call | x = predictors | y = response | training_frame = airlines +# + +## Build GLM Model and compare AUC with h2o1 +air.glm <- h2o.glm(x = myX, y = DepY, data = data.hex, family = "binomial") +pred_glm = h2o.predict(air.glm, data.hex) +auc_glm <- h2o.performance(pred_glm[,3], data.hex[ ,DepY], measure = "auc") +print(auc_glm) +expect_true(abs(auc_glm - 0.79) < 0.01) + +IgnoreCols_1 <- c('Year','Month','DayofMonth','DepTime','DayOfWeek','ArrTime','TailNum','ActualElapsedTime','AirTime','ArrDelay','DepDelay','TaxiIn','TaxiOut','Cancelled','CancellationCode','Diverted','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay') + +## Then remove those cols from validX list +myX1 <- which(!(names(data.hex) %in% IgnoreCols_1)) + +air.gbm <- h2o.gbm(x = myX1, y = DepY, data = data.hex, distribution = "bernoulli", ntrees=50) +pred_gbm = h2o.predict(air.gbm, data.hex) +auc_gbm <- h2o.performance(pred_gbm[,3], data.hex[ ,DepY], measure = "auc") +print(auc_gbm) +expect_true(abs(auc_gbm - 0.80) < 0.01) + +air.dl <- h2o.deeplearning(x = myX1, y = DepY, data = data.hex, epochs=1, hidden=c(50,50), loss = "CrossEntropy") +pred_dl = h2o.predict(air.dl, data.hex) +auc_dl <- h2o.performance(pred_dl[,3], data.hex[ ,DepY], measure = "auc") +print(auc_dl) +expect_true(abs(auc_dl - 0.80) <= 0.02) + +PASS_BANNER() diff --git a/R/tests/testdir_javapredict/runit_SRF_javapredict_chess_2x2x1000_rand.R b/R/tests/testdir_javapredict/runit_NOPASS_SRF_javapredict_chess_2x2x1000_rand.R similarity index 100% rename from R/tests/testdir_javapredict/runit_SRF_javapredict_chess_2x2x1000_rand.R rename to R/tests/testdir_javapredict/runit_NOPASS_SRF_javapredict_chess_2x2x1000_rand.R diff --git a/R/tests/testdir_jira/runit_hex_2022_prior_constraints.R b/R/tests/testdir_jira/runit_hex_2022_prior_constraints.R index a9fe6718ad..d9c12ddf66 100644 --- a/R/tests/testdir_jira/runit_hex_2022_prior_constraints.R +++ b/R/tests/testdir_jira/runit_hex_2022_prior_constraints.R @@ -15,6 +15,7 @@ test.Priors.BetaConstraints <- function(conn) { modelStack = h2o.importFile(conn, pathToFile) betaConstraints.hex = h2o.importFile(conn, pathToConstraints) beta_nointercept.hex <- betaConstraints.hex[1:nrow(betaConstraints.hex)-1,] + beta_nointercept.hex ## Set Parameters (default standardization = T) betaConstraints = as.data.frame(betaConstraints.hex) diff --git a/scripts/validate_r_cmd_check_output.py b/scripts/validate_r_cmd_check_output.py index 500834d951..f1e3210c82 100644 --- a/scripts/validate_r_cmd_check_output.py +++ b/scripts/validate_r_cmd_check_output.py @@ -49,7 +49,10 @@ def process(self): r"^Old maintainer\(s\):", r"^\s*Anqi Fu", r"^NOTE: There was 1 note.", + r"^The Title field starts with the package name.", + r"^The Date field is over a month old.", + r"^\n", r"^New submission", r"^Package was archived on CRAN", @@ -65,6 +68,9 @@ def process(self): r"^Package has FOSS license, installs .class/.jar but has no 'java' directory.", r"^\* DONE", + + r"^Checking URLs requires 'libcurl' support in the R build", + r"^Status: 2 NOTEs", ] s = f.readline() diff --git a/src/main/java/hex/gbm/DHistogram.java b/src/main/java/hex/gbm/DHistogram.java index 4410184532..5ce7387989 100755 --- a/src/main/java/hex/gbm/DHistogram.java +++ b/src/main/java/hex/gbm/DHistogram.java @@ -73,6 +73,7 @@ public void setMax( float max ) { old = _maxIn; } + private static int MAX_FACTOR_BINS=1024; // Allow more bins for factors public DHistogram( String name, final int nbins, final byte isInt, final float min, final float maxEx, long nelems, int min_rows, boolean doGrpSplit ) { assert nelems > 0; assert nbins >= 1; @@ -88,17 +89,15 @@ public DHistogram( String name, final int nbins, final byte isInt, final float m // See if we can show there are fewer unique elements than nbins. // Common for e.g. boolean columns, or near leaves. int xbins = nbins; - float step; - if( isInt>0 && maxEx-min <= nbins ) { + if( isInt>0 && maxEx-min <= Math.max(nbins,(isInt==2?MAX_FACTOR_BINS:nbins)) ) { assert ((long)min)==min; // No overflow xbins = (char)((long)maxEx-(long)min); // Shrink bins assert xbins > 1; // Caller ensures enough range to bother - step = 1.0f; // Fixed stepsize + _step = 1.0f; // Fixed stepsize } else { - step = (maxEx-min)/nbins; // Step size for linear interpolation - assert step > 0; + _step = nbins/(maxEx-min); + assert _step > 0 && !Float.isInfinite(_step); } - _step = 1.0f/step; // Use multiply instead of division during frequent binning math _nbin = (char)xbins; // Do not allocate the big arrays here; wait for scoreCols to pick which cols will be used. } @@ -174,7 +173,8 @@ void add( TDH dsh ) { static public float find_maxEx(float maxIn, int isInt ) { float ulp = Math.ulp(maxIn); if( isInt > 0 && 1 > ulp ) ulp = 1; - return maxIn+ulp; + float res = maxIn+ulp; + return Float.isInfinite(res) ? maxIn : res; } // Compute a "score" for a column; lower score "wins" (is a better split). diff --git a/src/main/java/hex/gbm/GBM.java b/src/main/java/hex/gbm/GBM.java index 5303f0c0f0..0da35c3469 100755 --- a/src/main/java/hex/gbm/GBM.java +++ b/src/main/java/hex/gbm/GBM.java @@ -143,8 +143,8 @@ private GBMModel(GBMModel prior, Key[][] treeKeys, double[] errs, ConfusionMatri if(family == Family.bernoulli) { bodyCtxSB.i().p("// Compute Probabilities for Bernoulli 0-1 classifier").nl(); bodyCtxSB.i().p("double fx = preds[1] + "+initialPrediction+";").nl(); - bodyCtxSB.i().p("preds[2] = 1.0f/(float)(1.0f+Math.exp(-fx))").nl(); - bodyCtxSB.i().p("preds[1] = 1.0f-preds[2]").nl(); + bodyCtxSB.i().p("preds[2] = 1.0f/(float)(1.0f+Math.exp(-fx));").nl(); + bodyCtxSB.i().p("preds[1] = 1.0f-preds[2];").nl(); } else if (isClassifier()) { bodyCtxSB.i().p("// Compute Probabilities for classifier (scale via http://www.hongliangjie.com/2011/01/07/logsum/)").nl(); diff --git a/src/main/java/water/Model.java b/src/main/java/water/Model.java index 7cd5e2f29c..0d1e9e790f 100644 --- a/src/main/java/water/Model.java +++ b/src/main/java/water/Model.java @@ -626,7 +626,7 @@ public SB toJava( SB sb ) { sb.nl(); sb.p("public class ").p(modelName).p(" extends water.genmodel.GeneratedModel {").nl(); // or extends GenerateModel toJavaInit(sb, fileContextSB).nl(); - toJavaNAMES(sb); + toJavaNAMES(sb, fileContextSB); toJavaNCLASSES(sb); toJavaDOMAINS(sb, fileContextSB); toJavaPROB(sb); @@ -636,15 +636,7 @@ public SB toJava( SB sb ) { sb.p(fileContextSB).nl(); // Append file return sb; } - // Same thing as toJava, but as a Javassist CtClass - private CtClass makeCtClass() throws CannotCompileException { - CtClass clz = ClassPool.getDefault().makeClass(JCodeGen.toJavaId(_key.toString())); - clz.addField(CtField.make(toJavaNAMES (new SB()).toString(),clz)); - clz.addField(CtField.make(toJavaNCLASSES(new SB()).toString(),clz)); - toJavaInit(clz); // Model-specific top-level goodness - clz.addMethod(CtMethod.make(toJavaPredict(new SB(), new SB()).toString(),clz)); // FIX ME - return clz; - } + /** Generate implementation for super class. */ protected SB toJavaSuper( SB sb ) { sb.nl(); @@ -656,7 +648,15 @@ protected SB toJavaSuper( SB sb ) { return sb; } - private SB toJavaNAMES( SB sb ) { return JCodeGen.toStaticVar(sb, "NAMES", _names, "Names of columns used by model."); } + private SB toJavaNAMES(SB sb, SB fileContextSB) { + String namesHolderClassName = "NamesHolder"; + sb.i().p("// ").p("Names of columns used by model.").nl(); + sb.i().p("public static final String[] NAMES = NamesHolder.VALUES;").nl(); + // Generate class which fills the names into array + fileContextSB.i().p("// The class representing training column names ").nl(); + JCodeGen.toClassWithArray(fileContextSB, null, namesHolderClassName, _names); + return sb; + } protected SB toJavaNCLASSES( SB sb ) { return isClassifier() ? JCodeGen.toStaticVar(sb, "NCLASSES", nclasses(), "Number of output classes included in training data response column.") : sb; } private SB toJavaDOMAINS( SB sb, SB fileContextSB ) { sb.nl(); @@ -667,11 +667,13 @@ private SB toJavaDOMAINS( SB sb, SB fileContextSB ) { String[] dom = _domains[i]; String colInfoClazz = "ColInfo_"+i; sb.i(1).p("/* ").p(_names[i]).p(" */ "); - sb.p(colInfoClazz).p(".VALUES"); + if (dom != null) sb.p(colInfoClazz).p(".VALUES"); else sb.p("null"); if (i!=_domains.length-1) sb.p(','); sb.nl(); - fileContextSB.i().p("// The class representing column ").p(_names[i]).nl(); - JCodeGen.toClassWithArray(fileContextSB, null, colInfoClazz, dom); + if (dom != null) { + fileContextSB.i().p("// The class representing column ").p(_names[i]).nl(); + JCodeGen.toClassWithArray(fileContextSB, null, colInfoClazz, dom); + } } return sb.i().p("};").nl(); } @@ -710,21 +712,6 @@ private SB toJavaPredict(SB ccsb, SB fileCtxSb) { // ccsb = classContext protected String toJavaDefaultMaxIters() { return "-1"; } - // Convenience method for testing: build Java, convert it to a class & - // execute it: compare the results of the new class's (JIT'd) scoring with - // the built-in (interpreted) scoring on this dataset. Throws if there - // is any error (typically an AssertionError). - public void testJavaScoring( Frame fr ) { - try { - //System.out.println(toJava()); - Class clz = ClassPool.getDefault().toClass(makeCtClass()); - Object modelo = clz.newInstance(); - } - catch( CannotCompileException cce ) { throw new Error(cce); } - catch( InstantiationException cce ) { throw new Error(cce); } - catch( IllegalAccessException cce ) { throw new Error(cce); } - } - /** Generates code which unify preds[1,...NCLASSES] */ protected void toJavaUnifyPreds(SB bodySb) { } diff --git a/src/main/java/water/api/GarbageCollect.java b/src/main/java/water/api/GarbageCollect.java new file mode 100644 index 0000000000..b4fa52f71d --- /dev/null +++ b/src/main/java/water/api/GarbageCollect.java @@ -0,0 +1,34 @@ +package water.api; + +import water.DTask; +import water.H2O; +import water.H2ONode; +import water.RPC; +import water.util.Log; + +public class GarbageCollect extends Request { + private static class GCTask extends DTask { + public GCTask() { + } + + @Override public void compute2() { + Log.info("Calling System.gc() now..."); + System.gc(); + Log.info("System.gc() finished"); + tryComplete(); + } + + @Override public byte priority() { + return H2O.MIN_HI_PRIORITY; + } + } + + @Override public RequestBuilders.Response serve(){ + for (H2ONode node : H2O.CLOUD._memary) { + GCTask t = new GCTask(); + new RPC(node, t).call().get(); + } + + return RequestBuilders.Response.doneEmpty(); + } +} diff --git a/src/main/java/water/api/RequestServer.java b/src/main/java/water/api/RequestServer.java index 34e77bf916..4ef5f621f0 100644 --- a/src/main/java/water/api/RequestServer.java +++ b/src/main/java/water/api/RequestServer.java @@ -172,6 +172,7 @@ public enum API_VERSION { registerRequest(new UnlockKeys()); registerRequest(new Order()); registerRequest(new RemoveVec()); + registerRequest(new GarbageCollect()); } else { Request.addToNavbar(registerRequest(new MatrixMultiply()), "Matrix Multiply", "Beta"); Request.addToNavbar(registerRequest(new hex.LR2()), "Linear Regression2", "Beta"); @@ -189,6 +190,7 @@ public enum API_VERSION { Request.addToNavbar(registerRequest(new UnlockKeys()), "Unlock Keys (use with caution)","Beta"); Request.addToNavbar(registerRequest(new Order()), "Order", "Beta"); Request.addToNavbar(registerRequest(new RemoveVec()), "RemoveVec", "Beta"); + Request.addToNavbar(registerRequest(new GarbageCollect()), "GarbageCollect", "Beta"); } registerRequest(new Up()); diff --git a/src/main/java/water/api/SaveModel.java b/src/main/java/water/api/SaveModel.java index 12149fd697..72b27d2bbb 100644 --- a/src/main/java/water/api/SaveModel.java +++ b/src/main/java/water/api/SaveModel.java @@ -1,6 +1,7 @@ package water.api; import java.io.*; + import static water.util.FSUtils.isHdfs; import static water.util.FSUtils.isS3N; @@ -8,6 +9,7 @@ import java.io.IOException; import hex.glm.GLMModel; + import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -16,6 +18,7 @@ import water.serial.Model2FileBinarySerializer; import water.serial.Model2HDFSBinarySerializer; import water.util.FSUtils; +import water.util.JCodeGen; public class SaveModel extends Func { static final int API_WEAVER = 1; @@ -48,7 +51,7 @@ private void saveToLocalFS() { // Create folder parentDir.mkdirs(); // Save parent model - new Model2FileBinarySerializer().save(model, new File(parentDir, model._key.toString())); + new Model2FileBinarySerializer().save(model, new File(parentDir, JCodeGen.toJavaId(model._key.toString()))); // Write to model_names File model_names = new File(parentDir, "model_names"); FileOutputStream is = new FileOutputStream(model_names); @@ -61,8 +64,8 @@ private void saveToLocalFS() { Model[] models = getCrossValModels(model); System.out.println(models); for (Model m : models) { - new Model2FileBinarySerializer().save(m, new File(parentDir, m._key.toString())); - br.write(m._key.toString()); + new Model2FileBinarySerializer().save(m, new File(parentDir, JCodeGen.toJavaId(m._key.toString()))); + br.write(JCodeGen.toJavaId(m._key.toString())); br.newLine(); } } @@ -80,7 +83,7 @@ private void saveToHdfs() { if (force && fs.exists(parentDir)) fs.delete(parentDir); fs.mkdirs(parentDir); // Save parent model - new Model2HDFSBinarySerializer(fs, force).save(model, new Path(parentDir, model._key.toString())); + new Model2HDFSBinarySerializer(fs, force).save(model, new Path(parentDir, JCodeGen.toJavaId(model._key.toString()))); // Save parent model key to model_names file Path model_names = new Path(parentDir, "model_names"); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(model_names,true))); @@ -89,8 +92,8 @@ private void saveToHdfs() { if (save_cv) { Model[] models = getCrossValModels(model); for (Model m : models ) { - new Model2HDFSBinarySerializer(fs, force).save(m, new Path(parentDir, m._key.toString())); - br.write(m._key.toString()); + new Model2HDFSBinarySerializer(fs, force).save(m, new Path(parentDir, JCodeGen.toJavaId(m._key.toString()))); + br.write(JCodeGen.toJavaId(m._key.toString())); br.newLine(); } } diff --git a/src/main/java/water/util/JCodeGen.java b/src/main/java/water/util/JCodeGen.java index 04790954b5..e2c6f2aab5 100644 --- a/src/main/java/water/util/JCodeGen.java +++ b/src/main/java/water/util/JCodeGen.java @@ -5,6 +5,8 @@ public class JCodeGen { + public static final String[] EMPTY_SA = new String[] {} ; + /** Generates data sample as a dedicated class with static double[][] member. */ public static SB toClass(SB sb, String classSig, String varname, Frame f, int nrows, String comment) { sb.p(classSig).p(" {").nl().ii(1); diff --git a/src/test/java/hex/drf/DRFTest.java b/src/test/java/hex/drf/DRFTest.java index 58b4bdc7ec..3152976833 100644 --- a/src/test/java/hex/drf/DRFTest.java +++ b/src/test/java/hex/drf/DRFTest.java @@ -7,6 +7,7 @@ import static org.junit.Assert.assertEquals; import water.*; +import water.api.AUC; import water.api.DRFModelView; import water.fvec.Frame; import water.fvec.RebalanceDataSet; @@ -100,7 +101,7 @@ abstract static class PrepData { abstract int prep(Frame fr); } } catch( IllegalArgumentException iae ) { /*pass*/ } } - @Test public void testBadData() throws Throwable { + @Ignore @Test public void testBadData() throws Throwable { basicDRFTestOOBE( "./smalldata/test/drf_infinitys.csv","infinitys.hex", new PrepData() { @Override int prep(Frame fr) { return fr.find("DateofBirth"); } }, @@ -161,8 +162,8 @@ public void testCreditProstate1() throws Throwable { return fr.find("IsDepDelayed"); } }, 50, - a( a(13987, 6900), - a( 6147,16944)), + a( a(13941, 6946), + a( 5885,17206)), s("NO", "YES")); } @@ -272,4 +273,51 @@ public void basicDRF(String fnametrain, String hexnametrain, String fnametest, S assertEquals(mses[i], mses[0], 1e-15); } } + + public static class repro { + @Ignore + @Test public void testAirline() throws InterruptedException { + Frame tfr=null; + Frame test=null; + + Scope.enter(); + try { + // Load data, hack frames + tfr = parseFrame(Key.make("air.hex"), "/users/arno/sz_bench_data/train-1m.csv"); + test = parseFrame(Key.make("airt.hex"), "/users/arno/sz_bench_data/test.csv"); + for (int i : new int[]{0,1,2}) { + tfr.vecs()[i] = tfr.vecs()[i].toEnum(); + test.vecs()[i] = test.vecs()[i].toEnum(); + } + + DRF parms = new DRF(); + parms.source = tfr; + parms.validation = test; +// parms.ignored_cols_by_name = new int[]{4,5,6}; +// parms.ignored_cols_by_name = new int[]{0,1,2,3,4,5,7}; + parms.response = tfr.lastVec(); + parms.nbins = 20; + parms.ntrees = 100; + parms.max_depth = 20; + parms.mtries = -1; + parms.sample_rate = 0.667f; + parms.min_rows = 10; + parms.classification = true; + parms.seed = 12; + + DRFModel drf = parms.fork().get(); + Frame pred = drf.score(test); + AUC auc = new AUC(); + auc.vactual = test.lastVec(); + auc.vpredict = pred.lastVec(); + auc.invoke(); + Log.info("Test set AUC: " + auc.data().AUC); + drf.delete(); + } finally{ + if (tfr != null) tfr.delete(); + if (test != null) test.delete(); + } + Scope.exit(); + } + } } diff --git a/src/test/java/hex/gbm/GBMTest.java b/src/test/java/hex/gbm/GBMTest.java index d2925cdc37..bdb11f644a 100644 --- a/src/test/java/hex/gbm/GBMTest.java +++ b/src/test/java/hex/gbm/GBMTest.java @@ -7,6 +7,7 @@ import org.junit.Assert; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import water.*; @@ -341,6 +342,7 @@ public GBMModel basicGBM(String fname, String hexname, PrepData prep, boolean va Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", mseWithoutVal, mseWithVal, 0.0001); } + @Ignore @Test public void testModelMSEEqualityOnTitanic() { final PrepData titanicPrep = new PrepData() {