Skip to content

Commit

Permalink
Merge branch 'master' into tomk-repackage-gson
Browse files Browse the repository at this point in the history
Conflicts:
	R/tests/testdir_javapredict/runit_PDATA_DRF_javapredict_churn_rand_large.R
  • Loading branch information
tomkraljevic committed Sep 23, 2014
2 parents b3c1adc + 2863166 commit d592cd9
Show file tree
Hide file tree
Showing 239 changed files with 2,448 additions and 2,714 deletions.
249 changes: 249 additions & 0 deletions R/examples/DEMO_TrainValidateTest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
#'
#' An H2O Demo: Building Many Models, Validating, and Scoring
#'


#######################
# Scraps & Extras:
#
# h2o.exportFile(preds, EXPORT_PREDS_PATH, force = TRUE)
#
# save the model
# h2o.saveModel(best_model, dir = "/home/spencer/pp_demo/", name = "best_model", force = TRUE)
#
# load model: h2o.loadModel(h, PATH_TO_SAVED_MODEL)
#
# **PATHS**
#
# For 181:
# AIRLINES_ALL_PATH <- "/home/0xdiag/datasets/airlines/airlines_all.csv"
# EXPORT_PREDS_PATH <- "/home/spencer/pp_demo/preds.csv"
# For local git:
AIRLINES_ALL_PATH <- "/Users/spencer/master/h2o/smalldata/airlines/allyears2k_headers.zip"
# EXPORT_PREDS_PATH <- "/Users/spencer/pp_demo/preds.csv"
# library(plyr)
#######################

#
##
### Begin the Demo
##
#
library(h2o)

# Assumes h2o started elsewhere
ip <- "127.0.0.1"
port <- 54321
h <- h2o.init(ip = ip, port = port)

#################################################
##### IMPORTANT VARIABLES TO SET HERE !!!! ######
#################################################
#AIRLINES_ALL_PATH <- "" # set this to the path to the airlines dataset
NUM_FEATURES <- 9 # set this to toggle the number of features to collect
#################################################
#################################################

if (AIRLINES_ALL_PATH == "") stop("AIRLINES_ALL_PATH must be set")
if (NUM_FEATURES <= 0) stop("NUM_FEATURES must be > 0")

# Read in the data
flights <- h2o.importFile(h, AIRLINES_ALL_PATH, "flights")

#################################################################################
#
# Columns of the flights data
#
#colnames(flights)
# [1] "Year" "Month" "DayofMonth"
# [4] "DayOfWeek" "DepTime" "CRSDepTime"
# [7] "ArrTime" "CRSArrTime" "UniqueCarrier"
#[10] "FlightNum" "TailNum" "ActualElapsedTime"
#[13] "CRSElapsedTime" "AirTime" "ArrDelay"
#[16] "DepDelay" "Origin" "Dest"
#[19] "Distance" "TaxiIn" "TaxiOut"
#[22] "Cancelled" "CancellationCode" "Diverted"
#[25] "CarrierDelay" "WeatherDelay" "NASDelay"
#[28] "SecurityDelay" "LateAircraftDelay" "IsArrDelayed"
#[31] "IsDepDelayed"
#################################################################################
vars <- colnames(flights)

# Suggested Explanatory Variables:
FlightDate <- vars[1:4] # "Year", "Month", "DayofMonth", "DayOfWeek"
ScheduledTimes <- vars[c(6,8,13)] # "CRSDepTime", "CRSArrTime", "CRSElapsedTime"
FlightInfo <- vars[c(9,17,18,19)] # "UniqueCarrier", "Origin", "Dest", "Distance"

# Combine the explanatory variables into a single variable
FlightsVars <- c(FlightDate, ScheduledTimes, FlightInfo)

# Response
Delayed <- vars[31] # "IsDepDelayed"
ArrivalDelayed <- vars[30] # "IsArrDelayed"

############################################################################################

# Split the flights data into train/validation/test splits of (60/10/30)
s <- h2o.runif(flights, seed = 123456789)
train <- flights[s < .6,]
valid <- flights[s >= .6 & s < .7,]
test <- flights[s >= .7,]

cat("\nTRAINING ROWS: ", nrow(train))
cat("\nVALIDATION ROWS: ", nrow(valid))
cat("\nTEST ROWS: ", nrow(test))

# Here's a function that takes a model and a testing dataset and calculates the AUC on the testdata...
test_performance <-
function(model, testdata, response) {
preds <- h2o.predict(model, testdata)

# p(success) is the last column in the frame returned by h2o.predict, that's what the ncol(preds) is for below
perf <- h2o.performance(data = preds[, ncol(preds)], reference = testdata[, response])
perf@model$auc
}

coda<-
function(t0, model, modeltype, testdata, response, top4) {
elapsed_seconds <- as.numeric(Sys.time() - t0)
modelkey <- model@key
type <- modeltype
#perform the holdout computation
test_auc <- test_performance(model, testdata, response)

result <- list(list(model, modeltype, response, elapsed_seconds, test_auc, top4))
names(result) <- model@key
return(result)
}

# Fit logistic regression for IsDepDelayed for some origin
lr.fit<-
function(response, dataset, testdata) {
print("Beginning GLM with 2-fold Cross Validation\n")
t0 <- Sys.time()
model <- h2o.glm(x = FlightsVars, y = response, data = dataset, family = "binomial", nfolds = 2, variable_importances = TRUE)
top4 <- paste(names(sort(model@model$coefficients, T))[1:NUM_FEATURES], collapse = ",", sep = ",")
coda(t0, model, "glm", testdata, response, top4)
}

rf.fit<-
function(response, dataset, testdata) {
print("Beginning Random Forest with 10 trees, 20 depth, and 2-fold Cross Validation\n")
t0 <- Sys.time()
model <- h2o.randomForest(x = FlightsVars, y = response, data = dataset, ntree = 10, depth = 20, nfolds = 2, balance.classes = T, type = "BigData", importance = TRUE)
top4 <- paste(names(sort(model@model$varimp[1,]))[1:NUM_FEATURES], collapse = ",", sep = ",")
coda(t0, model, "BigData_random_forest", testdata, response, top4)
}

srf.fit<-
function(response, dataset, testdata) {
print("Beginning Speedy Random Forest with 10 trees, 20 depth, and 2-fold Cross Validation\n")
t0 <- Sys.time()
model <- h2o.randomForest(x = FlightsVars, y = response, data = dataset, ntree = 10, depth = 20, nfolds = 2, balance.classes = T, type = "fast", importance = TRUE)
top4 <- paste(names(sort(model@model$varimp[1,]))[1:NUM_FEATURES], collapse = ",", sep = ",")
coda(t0, model, "fast_random_forest", testdata, response, top4)
}

gbm.fit<-
function(response, dataset, testdata) {
print("Beginning Gradient Boosted Machine with 50 trees, 5 depth, and 2-fold Cross Validation\n")
t0 <- Sys.time()
model <- h2o.gbm(x = FlightsVars, y = response, data = dataset, n.trees = 50, shrinkage = 1/50, nfolds = 2, balance.classes = T, importance = TRUE)
top4 <- paste(rownames(model@model$varimp)[1:NUM_FEATURES], collapse = ",", sep = ",")
coda(t0, model, "gbm", testdata, response, top4)
}

#dl.fit<-
#function(response, dataset, testdata) {
# print("Beginning Deep Learning with 3 hidden layers and 2-fold Cross Validation\n")
# t0 <- Sys.time()
# model <- h2o.deeplearning(x = FlightsVars, y = response, data = dataset,
# hidden = c(200,200,200),
# activation = "RectifierWithDropout",
# input_dropout_ratio = 0.2,
# l1 = 1e-5,
# train_samples_per_iteration = 10000,
# epochs = 100,
# nfolds = 2,
# balance_classes = T, importance = TRUE)
# coda(t0, model, "deeplearning", testdata, response)
#}

all.fit <- function(fitMethod, response, dataset, testdata) { fitMethod(response, dataset, testdata) }

#iterate over the fit fcns
model.fit.fcns <- c(lr.fit, rf.fit, srf.fit, gbm.fit)#, dl.fit)

models <- unlist(recursive = F, lapply(model.fit.fcns, all.fit, Delayed, train, valid))

##
### Now display the results in a frame sorted by AUC
##

# Use ldply to iterate over the list of models, extracting the model key, the model auc, the response, and the elapsed training time in seconds
#models.auc.response.frame <- ldply(models, function(x) {
# c(model_key = x[[1]]@key,
# model_type = x[[2]],
# train_auc = as.numeric(x[[1]]@model$auc),
# validation_auc = as.numeric(x[[5]]),
# important_feat = x[[6]],
# response = x[[3]],
# train_time = as.numeric(x[[4]]))})

#Alternative to ldply:

selectModel <- function(x) {
c(model_key = x[[1]]@key,
model_type = x[[2]],
train_auc = as.numeric(x[[1]]@model$auc),
validation_auc = as.numeric(x[[5]]),
important_feat = x[[6]],
response = x[[3]],
train_time_s = as.numeric(as.character(x[[4]])))
}

models.auc.response.frame <- as.data.frame(t(as.data.frame(lapply(models, selectModel))))
#t(lapply(models.auc.response.frame, class))

models.auc.response.frame$train_auc <- as.numeric(as.character(models.auc.response.frame$train_auc))
models.auc.response.frame$validation_auc <- as.numeric(as.character(models.auc.response.frame$validation_auc))

# sort the models by auc from worst to best
models.sort.by.auc <- models.auc.response.frame[with(models.auc.response.frame, order(response, validation_auc)),-1]
models.sort.by.auc <- models.sort.by.auc[rev(rownames(models.sort.by.auc)),]

# convert the `auc` and `train_time` columns into numerics
models.sort.by.auc$train_auc <- as.numeric(as.character(models.sort.by.auc$train_auc))
models.sort.by.auc$validation_auc <- as.numeric(as.character(models.sort.by.auc$validation_auc))
models.sort.by.auc$train_time <- as.numeric(as.character(models.sort.by.auc$train_time))

# display the frame
print(models.sort.by.auc)

# score the best model on the test data
best_model <- h2o.getModel(h, rownames(models.sort.by.auc)[1])
test_auc <- test_performance(best_model, test, Delayed) # Swap out test to any datset to do the final scoring on.
cat(paste(" -------------------------------\n",
"Best Model Performance On Final Testing Data:", "\n",
"AUC = ", test_auc, "\n",
"--------------------------------\n"))

# save the predictions
preds <- h2o.predict(best_model, test)


cat(paste(" =---------Summary------------=\n",
"Best model type: ", models.sort.by.auc[1,]$model_type, "\n",
"Best model auc on test: ", test_auc, "\n",
"Top", NUM_FEATURES, "important features: ", models.sort.by.auc[1,]$important_feat, "\n",
"Model training time: ", models.sort.by.auc[1,]$train_time_s, "\n",
"Training data rows: ", nrow(train), "\n",
"Training data cols: ", ncol(train), "\n",
"Validation data rows: ", nrow(valid), "\n",
"=----------------------------=\n"))

#
##
### End of Demo
##
#
16 changes: 15 additions & 1 deletion R/h2o-package/R/Algorithms.R
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
max_confusion_matrix_size,
max_hit_ratio_k,
balance_classes,
class_sampling_factors,
max_after_balance_size,
score_validation_sampling,
diagnostics,
Expand Down Expand Up @@ -556,6 +557,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
parms = .addIntParm(parms, k="max_confusion_matrix_size", v=max_confusion_matrix_size)
parms = .addIntParm(parms, k="max_hit_ratio_k", v=max_hit_ratio_k)
parms = .addBooleanParm(parms, k="balance_classes", v=balance_classes)
parms = .addDoubleArrayParm(parms, k="class_sampling_factors", v=class_sampling_factors)
parms = .addFloatParm(parms, k="max_after_balance_size", v=max_after_balance_size)
parms = .addStringParm(parms, k="score_validation_sampling", v=score_validation_sampling)
parms = .addBooleanParm(parms, k="diagnostics", v=diagnostics)
Expand Down Expand Up @@ -585,6 +587,18 @@ h2o.deeplearning <- function(x, y, data, key = "",
noGrid <- noGrid && (missing(momentum_stable) || length(momentum_stable) == 1)
noGrid <- noGrid && (missing(momentum_start) || length(momentum_start) == 1)
noGrid <- noGrid && (missing(nesterov_accelerated_gradient) || length(nesterov_accelerated_gradient) == 1)
noGrid <- noGrid && (missing(override_with_best_model) || length(override_with_best_model) == 1)
noGrid <- noGrid && (missing(seed) || length(seed) == 1)
noGrid <- noGrid && (missing(input_dropout_ratio) || length(input_dropout_ratio) == 1)
noGrid <- noGrid && (missing(hidden_dropout_ratios) || length(hidden_dropout_ratios) == 1)
noGrid <- noGrid && (missing(max_w2) || length(max_w2) == 1)
noGrid <- noGrid && (missing(initial_weight_distribution) || length(initial_weight_distribution) == 1)
noGrid <- noGrid && (missing(initial_weight_scale) || length(initial_weight_scale) == 1)
noGrid <- noGrid && (missing(loss) || length(loss) == 1)
noGrid <- noGrid && (missing(balance_classes) || length(balance_classes) == 1)
noGrid <- noGrid && (missing(max_after_balance_size) || length(max_after_balance_size) == 1)
noGrid <- noGrid && (missing(fast_mode) || length(fast_mode) == 1)
noGrid <- noGrid && (missing(shuffle_training_data) || length(shuffle_training_data) == 1)
if(noGrid)
.h2o.singlerun.internal("DeepLearning", data, res, nfolds, validation, parms)
else {
Expand Down Expand Up @@ -690,7 +704,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
}

if(!is.null(errs$variable_importances)) {
result$varimp <- errs$variable_importances$varimp
result$varimp <- as.data.frame(t(errs$variable_importances$varimp))
names(result$varimp) <- errs$variable_importances$variables
result$varimp <- sort(result$varimp, decreasing = TRUE)
}
Expand Down
1 change: 1 addition & 0 deletions R/h2o-package/R/Internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ h2o.setLogPath <- function(path, type) {
.h2o.__PAGE_VIEWALL = "StoreView.json"
.h2o.__DOWNLOAD_LOGS = "LogDownload.json"
.h2o.__DOMAIN_MAPPING = "2/DomainMapping.json"
.h2o.__PAGE_ALLMODELS = "2/Models.json"

.h2o.__PAGE_EXEC2 = "2/Exec2.json"
.h2o.__PAGE_IMPORTFILES2 = "2/ImportFiles2.json"
Expand Down
58 changes: 52 additions & 6 deletions R/h2o-package/R/ParseImport.R
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ h2o.ignoreColumns <- function(data, max_na = 0.2) {


# ------------------- Save H2O Model to Disk ----------------------------------------------------
h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
h2o.saveModel <- function(object, dir="", name="",save_cv=TRUE, force=FALSE) {
if(missing(object)) stop('Must specify object')
if(!inherits(object,'H2OModel')) stop('object must be an H2O model')
if(!is.character(dir)) stop('path must be of class character')
Expand All @@ -369,7 +369,7 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
# Create a model directory for each model saved that will include main model
# any cross validation models and a meta text file with all the model names listed
model_dir <- paste(dir, name, sep=.Platform$file.sep)
dir.create(model_dir)
dir.create(model_dir,showWarnings = F)

# Save main model
path <- paste(model_dir, object@key, sep=.Platform$file.sep)
Expand All @@ -378,12 +378,15 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
# Save all cross validation models
if (.hasSlot(object, "xval")) {
xval_keys <- sapply(object@xval,function(model) model@key )
if(save_cv & (length(xval_keys)==0)) stop('No cross validation models found')
if(save_cv) for (xval_key in xval_keys) .h2o.__remoteSend(object@data@h2o, .h2o.__PAGE_SaveModel, model=xval_key, path=paste(model_dir, xval_key, sep=.Platform$file.sep), force=force)
if(save_cv & !(length(xval_keys)==0)) {
for (xval_key in xval_keys) .h2o.__remoteSend(object@data@h2o, .h2o.__PAGE_SaveModel, model=xval_key, path=paste(model_dir, xval_key, sep=.Platform$file.sep), force=force)
} else {
save_cv <- FALSE # do not save CV results if they do not exist
}
} else {
save_cv <- FALSE # do not save CV results if they do not exist
save_cv <- FALSE # if no xval slot (Naive Bayes) no CV models
}

# Create new file called model_names and write all model names to file
fileConn <- file(paste(model_dir, "model_names", sep=.Platform$file.sep))
if(save_cv) {writeLines(text = c(object@key, xval_keys), con = fileConn)
Expand All @@ -395,6 +398,30 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
dirname(res$path)
}

# ------------------- Save All H2O Model to Disk --------------------------------------------------

h2o.saveAll <- function(object, dir="", save_cv=TRUE, force=FALSE) {
if(missing(object)) stop('Must specify object')
if(class(object) != 'H2OClient') stop('object must be of class H2OClient')

## Grab all the model keys in H2O
res = .h2o.__remoteSend(client = object, page = .h2o.__PAGE_ALLMODELS)
keys = names(res$models)

## Delete Duplicate Keys (this will avoid saving cross validation models multiple times for non-GLM models)
duplicates = {}
for(key in keys) { dups = grep(pattern = paste(key, "_", sep = ""), x = keys)
duplicates = append(x = duplicates, values = dups)
}
keys = keys[-duplicates]

## Create H2OModel objects in R (To grab the cross validation models)
models = lapply(keys, function(model_key) h2o.getModel(h2o = object, key = model_key))
m_path = sapply(models, function(model_obj) h2o.saveModel(model_obj, dir=dir, save_cv=save_cv, force=force) )
m_path
}


# ------------------- Load H2O Model from Disk ----------------------------------------------------
h2o.loadModel <- function(object, path="") {
if(missing(object)) stop('Must specify object')
Expand All @@ -411,3 +438,22 @@ h2o.loadModel <- function(object, path="") {
if(length(model_names)>0) for (key in model_names) .h2o.__remoteSend(object, .h2o.__PAGE_LoadModel, path = paste(path, key, sep=.Platform$file.sep) )
h2o.getModel(object, model_names[1])
}


# ------------------- Load All H2O Model in a directory from Disk -----------------------------------------------
h2o.loadAll <- function(object, dir="") {
if(missing(object)) stop('Must specify object')
if(class(object) != 'H2OClient') stop('object must be of class H2OClient')
if(!is.character(dir)) stop('dir must be of class character')

model_dirs = setdiff(list.dirs(dir), dir)
model_objs = {}
for(model_dir in model_dirs) {
print(paste("Loading ", basename(model_dir), "....",sep = ""))
temp_model = h2o.loadModel(object, path = model_dir)
model_objs = append(x = model_objs, values = temp_model)
}

model_objs
}

Loading

0 comments on commit d592cd9

Please sign in to comment.