Commit b88db1e

Merge branch 'master' of https://github.com/h2oai/h2o

spennihana committed Jun 8, 2015
2 parents 93f1f5f + 7ab449c
Showing 34 changed files with 1,116 additions and 49 deletions.
2 changes: 1 addition & 1 deletion R/ensemble/README.md
@@ -13,7 +13,7 @@ R CMD INSTALL h2o/R/ensemble/h2oEnsemble-package
- Install in R using `devtools::install_github`:
```
library(devtools)
install_github("h2oai/h2o/R/ensemble/h2oEnsemble-package")
install_github("h2oai/h2o-2/R/ensemble/h2oEnsemble-package")
```

## Create Ensembles
2 changes: 1 addition & 1 deletion R/h2o-package/R/Algorithms.R
@@ -813,7 +813,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
noGrid <- noGrid && (missing(override_with_best_model) || length(override_with_best_model) == 1)
noGrid <- noGrid && (missing(seed) || length(seed) == 1)
noGrid <- noGrid && (missing(input_dropout_ratio) || length(input_dropout_ratio) == 1)
- noGrid <- noGrid && (missing(hidden_dropout_ratios) || (!is.list(hidden_dropout_ratios) && length(hidden_dropout_ratios) > 1))
+ noGrid <- noGrid && (missing(hidden_dropout_ratios) || !(is.list(hidden_dropout_ratios) && length(hidden_dropout_ratios) > 1))
noGrid <- noGrid && (missing(max_w2) || length(max_w2) == 1)
noGrid <- noGrid && (missing(initial_weight_distribution) || length(initial_weight_distribution) == 1)
noGrid <- noGrid && (missing(initial_weight_scale) || length(initial_weight_scale) == 1)
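The one-character move of `!` above is the whole fix: the old form negated only `is.list(...)`, so a single dropout setting passed as a plain number failed the check and cleared `noGrid`, sending a single-model request down the grid-search path. A minimal sketch of the difference, with illustrative values (the surrounding `noGrid` chain treats `TRUE` as "still a single model"):

```
# Single model: one dropout ratio, passed as a plain numeric value.
x <- 0.5
!is.list(x) && length(x) > 1    # old form: FALSE, so noGrid is wrongly cleared
!(is.list(x) && length(x) > 1)  # new form: TRUE, single-model path is kept

# Grid search: several settings, passed as a list of vectors.
g <- list(c(0.1, 0.2), c(0.3, 0.4))
!(is.list(g) && length(g) > 1)  # FALSE, so a grid is correctly detected
```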
11 changes: 11 additions & 0 deletions R/h2o-package/R/Internal.R
@@ -116,6 +116,7 @@ h2o.setLogPath <- function(path, type) {
.h2o.__SET_DOMAIN = "2/SetDomains.json"
.h2o.__PAGE_ALLMODELS = "2/Models.json"
.h2o.__GAINS <- "2/GainsLiftTable.json"
+ .h2o.__PAGE_GARBAGECOLLECT = "GarbageCollect.json"

.h2o.__PAGE_IMPUTE= "2/Impute.json"
.h2o.__PAGE_EXEC2 = "2/Exec2.json"
@@ -1054,3 +1055,13 @@ h2o.getFrame <- function(h2o, key) {
"gamma" = Gamma(link))
}
}

+ #
+ # This function is internal intentionally.
+ #
+ # Call it as:
+ #   h2o:::.h2o.garbageCollect(localH2O)
+ #
+ .h2o.garbageCollect <- function(client) {
+   res = .h2o.__remoteSend(client, .h2o.__PAGE_GARBAGECOLLECT)
+ }
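Following the usage note in the comment, a minimal sketch of calling the new hook (assumes an H2O 2.x cluster already running on localhost:54321; `:::` is required because the function is deliberately not exported):

```
library(h2o)
localH2O <- h2o.init(ip = "localhost", port = 54321, startH2O = FALSE)
# Sends a request to the new GarbageCollect.json endpoint, which presumably
# asks the cluster's JVMs to run a garbage collection.
h2o:::.h2o.garbageCollect(localH2O)
```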
49 changes: 49 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_186KRows_3.2KCols_xlarge.R
@@ -0,0 +1,49 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a DL model
#          for 186K rows and 3.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T))
paste("Time it took to parse", parse_time[[1]])

colNames <- c()
for(col in names(data.hex)) {
colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col))
colNames = append(colNames, colName)
}

colNames[1] <- "C1"
names(data.hex) <- colNames

myY = colNames[1]
myX = setdiff(names(data.hex), myY)

# Start modeling
#Deep Learning
dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY, data=data.hex,
epochs=.1, hidden=c(5,5)))
paste("Time it took to build DL ", dl_time[[1]])
data1.dl

PASS_BANNER()
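The column-renaming loop in this script prefixes purely numeric column names with `C` and leaves the rest untouched (`as.numeric` returns `NA`, with a warning, for non-numeric strings, which is exactly what the test keys on). A standalone sketch with hypothetical names:

```
nm <- c("1", "age", "2", "income")
suppressWarnings(
  sapply(nm, function(col) if (is.na(as.numeric(col))) col else paste0("C", col))
)
#        1      age        2   income
#     "C1"    "age"     "C2" "income"
```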
43 changes: 43 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_1MRows_2.2KCols_xlarge.R
@@ -0,0 +1,43 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a DL model
#          on 1M rows and 2.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/1Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

response=1 #1:1000 imbalance
predictors=c(3:ncol(data.hex))

# Start modeling
# DL
dl_time <- system.time(mdl.dl <- h2o.deeplearning(x=predictors, y=response,
data=data.hex, replicate_training_data=FALSE, epochs=.1, hidden=c(5,5)))
mdl.dl
paste("Time it took to build DL ", dl_time[[1]])

PASS_BANNER()

58 changes: 58 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_DL_airlines_billion_xlarge.R
@@ -0,0 +1,58 @@
#----------------------------------------------------------------------
# Purpose: This test exercises HDFS operations from R.
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(testthat)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/airlinesbillion.csv"

#----------------------------------------------------------------------
# Single file cases.
#----------------------------------------------------------------------

heading("Testing single file importHDFS")
url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

data1.hex <- data.hex

n <- nrow(data.hex)
print(n)
if (n != 1166952590) {
stop("nrows is wrong")
}

# Construct train and validation sets by sampling (80/20),
# creating a random column as tall as the data (nrow(data.hex))
s <- h2o.runif(data.hex) # Useful when number of rows too large for R to handle
data.train <- data.hex[s <= 0.8,]
data.valid <- data.hex[s > 0.8,]

## Choose which col as response
## Response = IsDepDelayed
myY = "C31"
myX = setdiff(names(data1.hex), myY)

dl_time <- system.time(data1.dl <- h2o.deeplearning(x=myX, y=myY,
data=data.train, validation=data.valid, replicate_training_data=FALSE,
epochs=.1, hidden=c(5,5)))
data1.dl
paste("Time it took to build DL ", dl_time[[1]])

PASS_BANNER()
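The 80/20 split in this script is the standard H2O idiom for data too large for R: `h2o.runif` materializes the uniform column on the cluster, so the billion-row frame never enters R's memory, and the two slices are complementary cuts of the same draw. Condensed:

```
s <- h2o.runif(data.hex)             # uniform [0,1) column, computed cluster-side
data.train <- data.hex[s <= 0.8, ]   # roughly 80% of rows
data.valid <- data.hex[s >  0.8, ]   # the remaining ~20%, disjoint by construction
```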
42 changes: 42 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_15MRows_2.2KCols_xlarge.R
@@ -0,0 +1,42 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          on 15M rows and 2.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/15Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

response=1 #1:1000 imbalance
predictors=c(3:ncol(data.hex))

# Start modeling
# Gradient Boosted Trees
gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response, data=data.hex, distribution = "bernoulli"))
mdl.gbm
paste("Time it took to build GBM ", gbm_time[[1]])

PASS_BANNER()

49 changes: 49 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_186KRows_3.2KCols_xlarge.R
@@ -0,0 +1,49 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          for 186K rows and 3.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c25/df_h2o.csv")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c25/df_h2o.csv", header = T))
paste("Time it took to parse", parse_time[[1]])

colNames <- c()
for(col in names(data.hex)) {
colName <- if(is.na(as.numeric(col))) col else paste0("C", as.character(col))
colNames = append(colNames, colName)
}

colNames[1] <- "C1"
names(data.hex) <- colNames

myY = colNames[1]
myX = setdiff(names(data.hex), myY)

# Start modeling
#GBM on original dataset
gbm_time <- system.time(data1.gbm <- h2o.gbm(x = myX, y = myY, data = data.hex,
n.trees = 10, interaction.depth = 5, distribution = "multinomial"))
paste("Time it took to build GBM ", gbm_time[[1]])
data1.gbm

PASS_BANNER()
43 changes: 43 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_1MRows_2.2KCols_xlarge.R
@@ -0,0 +1,43 @@

#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          on 1M rows and 2.2K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

hdfs_data_file = "/datasets/1Mx2.2k.csv"
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------

url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_data_file)
parse_time <- system.time(data.hex <- h2o.importFile(conn, url))
paste("Time it took to parse", parse_time[[1]])

response=1 #1:1000 imbalance
predictors=c(3:ncol(data.hex))

# Start modeling
# Gradient Boosted Trees
gbm_time <- system.time(mdl.gbm <- h2o.gbm(x=predictors, y=response,
data=data.hex, distribution = "bernoulli"))
mdl.gbm
paste("Time it took to build GBM ", gbm_time[[1]])

PASS_BANNER()

46 changes: 46 additions & 0 deletions R/tests/testdir_hdfs_xlarge/runit_GBM_376KRows_6KCols_xlarge.R
@@ -0,0 +1,46 @@
#----------------------------------------------------------------------
# Purpose: This test exercises building a GBM model
#          for 376K rows and 6.9K columns
#----------------------------------------------------------------------

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

ipPort <- get_args(commandArgs(trailingOnly = TRUE))
myIP <- ipPort[[1]]
myPort <- ipPort[[2]]
hdfs_name_node <- Sys.getenv(c("NAME_NODE"))
print(hdfs_name_node)

library(RCurl)
library(h2o)

running_inside_hexdata = file.exists("/mnt/0xcustomer-datasets/c28")

heading("BEGIN TEST")
conn <- h2o.init(ip=myIP, port=myPort, startH2O = FALSE)
h2o.removeAll()

h2o.ls(conn)
#----------------------------------------------------------------------
# Parameters for the test.
#----------------------------------------------------------------------
parse_time <- system.time(data.hex <- h2o.importFile(conn, "/mnt/0xcustomer-datasets/c28/mr_output.tsv.sorted.gz"))
paste("Time it took to parse", parse_time[[1]])

dim(data.hex)

s = h2o.runif(data.hex)
train = data.hex[s <= 0.8,]
valid = data.hex[s > 0.8,]

#GBM model
gbm_time <- system.time(model.gbm <- h2o.gbm(x = 3:(ncol(train)), y = 2,
data = train, validation=valid, n.trees=10, interaction.depth=5))
paste("Time it took to build GBM ", gbm_time[[1]])
model.gbm

pred = h2o.predict(model.gbm, valid)
perf <- h2o.performance(pred[,3], valid[,2])

PASS_BANNER()
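The scoring step at the end computes performance by hand from the prediction frame. In the h2o-2 R API a binomial prediction frame carries the predicted label plus one probability column per class, so `pred[,3]` is taken here as the positive-class probability; that column layout is an assumption worth double-checking against the actual frame. Annotated:

```
pred <- h2o.predict(model.gbm, valid)   # assumed columns: predict, p(0), p(1)
perf <- h2o.performance(pred[, 3],      # treated as P(positive class)
                        valid[, 2])     # actual response, the same column used to train
print(perf)
```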