Merge branch 'master' into tomk-repackage-gson

Conflicts: R/tests/testdir_javapredict/runit_PDATA_DRF_javapredict_churn_rand_large.R
hansight · Sep 23, 2014 · d592cd9 · d592cd9
2 parents b3c1adc + 2863166
commit d592cd9
Show file tree

Hide file tree

Showing 239 changed files with 2,448 additions and 2,714 deletions.
diff --git a/R/examples/DEMO_TrainValidateTest.R b/R/examples/DEMO_TrainValidateTest.R
@@ -0,0 +1,249 @@
+#'
+#' An H2O Demo: Building Many Models, Validating, and Scoring
+#'
+
+
+#######################
+# Scraps & Extras:
+#
+# h2o.exportFile(preds, EXPORT_PREDS_PATH, force = TRUE)
+#
+# save the model
+# h2o.saveModel(best_model, dir = "/home/spencer/pp_demo/", name = "best_model", force = TRUE)
+#
+# load model: h2o.loadModel(h, PATH_TO_SAVED_MODEL)
+#
+# **PATHS**
+#
+# For 181:
+# AIRLINES_ALL_PATH <- "/home/0xdiag/datasets/airlines/airlines_all.csv"
+# EXPORT_PREDS_PATH <- "/home/spencer/pp_demo/preds.csv"
+# For local git:
+AIRLINES_ALL_PATH <- "/Users/spencer/master/h2o/smalldata/airlines/allyears2k_headers.zip"
+# EXPORT_PREDS_PATH <- "/Users/spencer/pp_demo/preds.csv"
+# library(plyr)
+#######################
+
+#
+##
+### Begin the Demo
+##
+#
+library(h2o)
+
+# Assumes h2o started elsewhere
+ip <- "127.0.0.1"
+port <- 54321
+h <- h2o.init(ip = ip, port = port)
+
+#################################################
+##### IMPORTANT VARIABLES TO SET HERE !!!! ######
+#################################################
+#AIRLINES_ALL_PATH <- ""   # set this to the path to the airlines dataset
+NUM_FEATURES <- 9        # set this to toggle the number of features to collect
+#################################################
+#################################################
+
+if (AIRLINES_ALL_PATH == "") stop("AIRLINES_ALL_PATH must be set")
+if (NUM_FEATURES <= 0) stop("NUM_FEATURES must be > 0")
+
+# Read in the data
+flights <- h2o.importFile(h, AIRLINES_ALL_PATH, "flights")
+
+#################################################################################
+#
+# Columns of the flights data
+#
+#colnames(flights)
+# [1] "Year"              "Month"             "DayofMonth"
+# [4] "DayOfWeek"         "DepTime"           "CRSDepTime"
+# [7] "ArrTime"           "CRSArrTime"        "UniqueCarrier"
+#[10] "FlightNum"         "TailNum"           "ActualElapsedTime"
+#[13] "CRSElapsedTime"    "AirTime"           "ArrDelay"
+#[16] "DepDelay"          "Origin"            "Dest"
+#[19] "Distance"          "TaxiIn"            "TaxiOut"
+#[22] "Cancelled"         "CancellationCode"  "Diverted"
+#[25] "CarrierDelay"      "WeatherDelay"      "NASDelay"
+#[28] "SecurityDelay"     "LateAircraftDelay" "IsArrDelayed"
+#[31] "IsDepDelayed"
+#################################################################################
+vars <- colnames(flights)
+
+# Suggested Explanatory Variables:
+FlightDate     <- vars[1:4]        # "Year", "Month", "DayofMonth", "DayOfWeek"
+ScheduledTimes <- vars[c(6,8,13)]  # "CRSDepTime", "CRSArrTime", "CRSElapsedTime"
+FlightInfo     <- vars[c(9,17,18,19)] # "UniqueCarrier", "Origin", "Dest", "Distance"
+
+# Combine the explanatory variables into a single variable
+FlightsVars <- c(FlightDate, ScheduledTimes, FlightInfo)
+
+# Response
+Delayed        <- vars[31]         # "IsDepDelayed"
+ArrivalDelayed <- vars[30]         # "IsArrDelayed"
+
+############################################################################################
+
+# Split the flights data into train/validation/test splits of (60/10/30)
+s <- h2o.runif(flights, seed = 123456789)
+train <- flights[s < .6,]
+valid <- flights[s >= .6 & s < .7,]
+test  <- flights[s >= .7,]
+
+cat("\nTRAINING ROWS: ", nrow(train))
+cat("\nVALIDATION ROWS: ", nrow(valid))
+cat("\nTEST ROWS: ", nrow(test))
+
+# Here's a function that takes a model and a testing dataset and calculates the AUC on the testdata...
+test_performance <-
+function(model, testdata, response) {
+  preds <- h2o.predict(model, testdata)
+
+  # p(success)  is the last column in the frame returned by h2o.predict, that's what the ncol(preds) is for below
+  perf  <- h2o.performance(data = preds[, ncol(preds)], reference = testdata[, response])
+  perf@model$auc
+}
+
+coda<-
+function(t0, model, modeltype, testdata, response, top4) {
+  elapsed_seconds <- as.numeric(Sys.time() - t0)
+  modelkey <- model@key
+  type <- modeltype
+  #perform the holdout computation
+  test_auc <- test_performance(model, testdata, response)
+
+  result <- list(list(model, modeltype, response, elapsed_seconds, test_auc, top4))
+  names(result) <- model@key
+  return(result)
+}
+
+# Fit logistic regression for IsDepDelayed for some origin
+lr.fit<-
+function(response, dataset, testdata) {
+  print("Beginning GLM with 2-fold Cross Validation\n")
+  t0 <- Sys.time()
+  model <- h2o.glm(x = FlightsVars, y = response, data = dataset, family = "binomial", nfolds = 2, variable_importances = TRUE)
+  top4 <- paste(names(sort(model@model$coefficients, T))[1:NUM_FEATURES], collapse = ",", sep = ",")
+  coda(t0, model, "glm", testdata, response, top4)
+}
+
+rf.fit<-
+function(response, dataset, testdata) {
+  print("Beginning Random Forest with 10 trees, 20 depth, and 2-fold Cross Validation\n")
+  t0 <- Sys.time()
+  model <- h2o.randomForest(x = FlightsVars, y = response, data = dataset, ntree = 10, depth = 20, nfolds = 2, balance.classes = T, type = "BigData", importance = TRUE)
+  top4 <- paste(names(sort(model@model$varimp[1,]))[1:NUM_FEATURES], collapse = ",", sep = ",")
+  coda(t0, model, "BigData_random_forest", testdata, response, top4)
+}
+
+srf.fit<-
+function(response, dataset, testdata) {
+  print("Beginning Speedy Random Forest with 10 trees, 20 depth, and 2-fold Cross Validation\n")
+  t0 <- Sys.time()
+  model <- h2o.randomForest(x = FlightsVars, y = response, data = dataset, ntree = 10, depth = 20, nfolds = 2, balance.classes = T, type = "fast", importance = TRUE)
+  top4 <- paste(names(sort(model@model$varimp[1,]))[1:NUM_FEATURES], collapse = ",", sep = ",")
+  coda(t0, model, "fast_random_forest", testdata, response, top4)
+}
+
+gbm.fit<-
+function(response, dataset, testdata) {
+  print("Beginning Gradient Boosted Machine with 50 trees, 5 depth, and 2-fold Cross Validation\n")
+  t0 <- Sys.time()
+  model <- h2o.gbm(x = FlightsVars, y = response, data = dataset, n.trees = 50, shrinkage = 1/50, nfolds = 2, balance.classes = T, importance = TRUE)
+  top4 <- paste(rownames(model@model$varimp)[1:NUM_FEATURES], collapse = ",", sep = ",")
+  coda(t0, model, "gbm", testdata, response, top4)
+}
+
+#dl.fit<-
+#function(response, dataset, testdata) {
+#  print("Beginning Deep Learning with 3 hidden layers and 2-fold Cross Validation\n")
+#  t0 <- Sys.time()
+#  model <- h2o.deeplearning(x = FlightsVars, y = response, data = dataset,
+#                            hidden = c(200,200,200),
+#                            activation = "RectifierWithDropout",
+#                            input_dropout_ratio = 0.2,
+#                            l1 = 1e-5,
+#                            train_samples_per_iteration = 10000,
+#                            epochs = 100,
+#                            nfolds = 2,
+#                            balance_classes = T, importance = TRUE)
+#  coda(t0, model, "deeplearning", testdata, response)
+#}
+
+all.fit <- function(fitMethod, response, dataset, testdata) { fitMethod(response, dataset, testdata) }
+
+#iterate over the fit fcns
+model.fit.fcns <- c(lr.fit, rf.fit, srf.fit, gbm.fit)#, dl.fit)
+
+models <- unlist(recursive = F, lapply(model.fit.fcns, all.fit, Delayed, train, valid))
+
+##
+### Now display the results in a frame sorted by AUC
+##
+
+# Use ldply to iterate over the list of models, extracting the model key, the model auc, the response, and the elapsed training time in seconds
+#models.auc.response.frame <- ldply(models, function(x) {
+#                                  c(model_key = x[[1]]@key,
+#                                    model_type = x[[2]],
+#                                    train_auc = as.numeric(x[[1]]@model$auc),
+#                                    validation_auc = as.numeric(x[[5]]),
+#                                    important_feat = x[[6]],
+#                                    response = x[[3]],
+#                                    train_time = as.numeric(x[[4]]))})
+
+#Alternative to ldply:
+
+selectModel <- function(x) {
+  c(model_key = x[[1]]@key,
+  model_type = x[[2]],
+  train_auc = as.numeric(x[[1]]@model$auc),
+  validation_auc = as.numeric(x[[5]]),
+  important_feat = x[[6]],
+  response = x[[3]],
+  train_time_s = as.numeric(as.character(x[[4]])))
+}
+
+models.auc.response.frame <- as.data.frame(t(as.data.frame(lapply(models, selectModel))))
+#t(lapply(models.auc.response.frame, class))
+
+models.auc.response.frame$train_auc <- as.numeric(as.character(models.auc.response.frame$train_auc))
+models.auc.response.frame$validation_auc <- as.numeric(as.character(models.auc.response.frame$validation_auc))
+
+# sort the models by auc from worst to best
+models.sort.by.auc <- models.auc.response.frame[with(models.auc.response.frame, order(response, validation_auc)),-1]
+models.sort.by.auc <- models.sort.by.auc[rev(rownames(models.sort.by.auc)),]
+
+# convert the `auc` and `train_time` columns into numerics
+models.sort.by.auc$train_auc       <- as.numeric(as.character(models.sort.by.auc$train_auc))
+models.sort.by.auc$validation_auc  <- as.numeric(as.character(models.sort.by.auc$validation_auc))
+models.sort.by.auc$train_time      <- as.numeric(as.character(models.sort.by.auc$train_time))
+
+# display the frame
+print(models.sort.by.auc)
+
+# score the best model on the test data
+best_model <- h2o.getModel(h, rownames(models.sort.by.auc)[1])
+test_auc <- test_performance(best_model, test, Delayed)  # Swap out test to any datset to do the final scoring on.
+cat(paste(" -------------------------------\n",
+          "Best Model Performance On Final Testing Data:", "\n",
+          "AUC = ", test_auc, "\n",
+          "--------------------------------\n"))
+
+# save the predictions
+preds <- h2o.predict(best_model, test)
+
+
+cat(paste(" =---------Summary------------=\n",
+            "Best model type: ", models.sort.by.auc[1,]$model_type, "\n",
+            "Best model auc on test: ", test_auc, "\n",
+            "Top", NUM_FEATURES, "important features: ", models.sort.by.auc[1,]$important_feat, "\n",
+            "Model training time: ", models.sort.by.auc[1,]$train_time_s, "\n",
+            "Training data rows: ", nrow(train), "\n",
+            "Training data cols: ", ncol(train), "\n",
+            "Validation data rows: ", nrow(valid), "\n",
+           "=----------------------------=\n"))
+
+#
+##
+### End of Demo
+##
+#
diff --git a/R/h2o-package/R/Algorithms.R b/R/h2o-package/R/Algorithms.R
@@ -456,6 +456,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
                              max_confusion_matrix_size,
                              max_hit_ratio_k,
                              balance_classes,
+                             class_sampling_factors,
                              max_after_balance_size,
                              score_validation_sampling,
                              diagnostics,
@@ -556,6 +557,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
   parms = .addIntParm(parms, k="max_confusion_matrix_size", v=max_confusion_matrix_size)
   parms = .addIntParm(parms, k="max_hit_ratio_k", v=max_hit_ratio_k)
   parms = .addBooleanParm(parms, k="balance_classes", v=balance_classes)
+  parms = .addDoubleArrayParm(parms, k="class_sampling_factors", v=class_sampling_factors)
   parms = .addFloatParm(parms, k="max_after_balance_size", v=max_after_balance_size)
   parms = .addStringParm(parms, k="score_validation_sampling", v=score_validation_sampling)
   parms = .addBooleanParm(parms, k="diagnostics", v=diagnostics)
@@ -585,6 +587,18 @@ h2o.deeplearning <- function(x, y, data, key = "",
   noGrid <- noGrid && (missing(momentum_stable) || length(momentum_stable) == 1)
   noGrid <- noGrid && (missing(momentum_start) || length(momentum_start) == 1)
   noGrid <- noGrid && (missing(nesterov_accelerated_gradient) || length(nesterov_accelerated_gradient) == 1)
+  noGrid <- noGrid && (missing(override_with_best_model) || length(override_with_best_model) == 1)
+  noGrid <- noGrid && (missing(seed) || length(seed) == 1)
+  noGrid <- noGrid && (missing(input_dropout_ratio) || length(input_dropout_ratio) == 1)
+  noGrid <- noGrid && (missing(hidden_dropout_ratios) || length(hidden_dropout_ratios) == 1)
+  noGrid <- noGrid && (missing(max_w2) || length(max_w2) == 1)
+  noGrid <- noGrid && (missing(initial_weight_distribution) || length(initial_weight_distribution) == 1)
+  noGrid <- noGrid && (missing(initial_weight_scale) || length(initial_weight_scale) == 1)
+  noGrid <- noGrid && (missing(loss) || length(loss) == 1)
+  noGrid <- noGrid && (missing(balance_classes) || length(balance_classes) == 1)
+  noGrid <- noGrid && (missing(max_after_balance_size) || length(max_after_balance_size) == 1)
+  noGrid <- noGrid && (missing(fast_mode) || length(fast_mode) == 1)
+  noGrid <- noGrid && (missing(shuffle_training_data) || length(shuffle_training_data) == 1)
   if(noGrid)
     .h2o.singlerun.internal("DeepLearning", data, res, nfolds, validation, parms)
   else {
@@ -690,7 +704,7 @@ h2o.deeplearning <- function(x, y, data, key = "",
   }
 
   if(!is.null(errs$variable_importances)) {
-    result$varimp <- errs$variable_importances$varimp
+    result$varimp <- as.data.frame(t(errs$variable_importances$varimp))
     names(result$varimp) <- errs$variable_importances$variables
     result$varimp <- sort(result$varimp, decreasing = TRUE)
   }

diff --git a/R/h2o-package/R/Internal.R b/R/h2o-package/R/Internal.R
@@ -104,6 +104,7 @@ h2o.setLogPath <- function(path, type) {
 .h2o.__PAGE_VIEWALL = "StoreView.json"
 .h2o.__DOWNLOAD_LOGS = "LogDownload.json"
 .h2o.__DOMAIN_MAPPING = "2/DomainMapping.json"
+.h2o.__PAGE_ALLMODELS = "2/Models.json"
 
 .h2o.__PAGE_EXEC2 = "2/Exec2.json"
 .h2o.__PAGE_IMPORTFILES2 = "2/ImportFiles2.json"

diff --git a/R/h2o-package/R/ParseImport.R b/R/h2o-package/R/ParseImport.R
@@ -354,7 +354,7 @@ h2o.ignoreColumns <- function(data, max_na = 0.2) {
 
 
 # ------------------- Save H2O Model to Disk ----------------------------------------------------
-h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
+h2o.saveModel <- function(object, dir="", name="",save_cv=TRUE, force=FALSE) {
     if(missing(object)) stop('Must specify object')
     if(!inherits(object,'H2OModel')) stop('object must be an H2O model')
     if(!is.character(dir)) stop('path must be of class character')
@@ -369,7 +369,7 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
     # Create a model directory for each model saved that will include main model
     # any cross validation models and a meta text file with all the model names listed
     model_dir <- paste(dir, name, sep=.Platform$file.sep)
-    dir.create(model_dir)
+    dir.create(model_dir,showWarnings = F)
 
     # Save main model
     path <- paste(model_dir, object@key, sep=.Platform$file.sep)
@@ -378,12 +378,15 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
     # Save all cross validation models
     if (.hasSlot(object, "xval")) {
       xval_keys <- sapply(object@xval,function(model) model@key )
-      if(save_cv & (length(xval_keys)==0)) stop('No cross validation models found')
-      if(save_cv) for (xval_key in xval_keys) .h2o.__remoteSend(object@data@h2o, .h2o.__PAGE_SaveModel, model=xval_key, path=paste(model_dir, xval_key, sep=.Platform$file.sep), force=force)
+      if(save_cv & !(length(xval_keys)==0)) {
+        for (xval_key in xval_keys) .h2o.__remoteSend(object@data@h2o, .h2o.__PAGE_SaveModel, model=xval_key, path=paste(model_dir, xval_key, sep=.Platform$file.sep), force=force)
+      } else {
+        save_cv <- FALSE # do not save CV results if they do not exist
+      }
     } else {
-      save_cv <- FALSE  # do not save CV results if they do not exist
+      save_cv <- FALSE # if no xval slot (Naive Bayes) no CV models
     }
-    
+
     # Create new file called model_names and write all model names to file
     fileConn <- file(paste(model_dir, "model_names", sep=.Platform$file.sep))
     if(save_cv) {writeLines(text = c(object@key, xval_keys), con = fileConn)
@@ -395,6 +398,30 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=FALSE, force=FALSE) {
     dirname(res$path)
 }
 
+# ------------------- Save All H2O Model to Disk --------------------------------------------------
+
+h2o.saveAll <- function(object, dir="", save_cv=TRUE, force=FALSE) {
+  if(missing(object)) stop('Must specify object')
+  if(class(object) != 'H2OClient') stop('object must be of class H2OClient')
+
+  ## Grab all the model keys in H2O
+  res = .h2o.__remoteSend(client = object, page = .h2o.__PAGE_ALLMODELS)
+  keys = names(res$models)
+
+  ## Delete Duplicate Keys (this will avoid saving cross validation models multiple times for non-GLM models)
+  duplicates = {}
+  for(key in keys) { dups = grep(pattern = paste(key, "_", sep = ""), x = keys)
+    duplicates = append(x = duplicates, values = dups)
+  }
+  keys = keys[-duplicates]
+
+  ## Create H2OModel objects in R (To grab the cross validation models)
+  models = lapply(keys, function(model_key) h2o.getModel(h2o = object, key = model_key))
+  m_path = sapply(models, function(model_obj) h2o.saveModel(model_obj, dir=dir, save_cv=save_cv, force=force) )
+  m_path
+}
+
+
 # ------------------- Load H2O Model from Disk ----------------------------------------------------
 h2o.loadModel <- function(object, path="") {
     if(missing(object)) stop('Must specify object')
@@ -411,3 +438,22 @@ h2o.loadModel <- function(object, path="") {
     if(length(model_names)>0) for (key in model_names) .h2o.__remoteSend(object, .h2o.__PAGE_LoadModel, path = paste(path, key, sep=.Platform$file.sep) )
     h2o.getModel(object, model_names[1])
 }
+
+
+# ------------------- Load All H2O Model in a directory from Disk -----------------------------------------------
+h2o.loadAll <- function(object, dir="") {
+  if(missing(object)) stop('Must specify object')
+  if(class(object) != 'H2OClient') stop('object must be of class H2OClient')
+  if(!is.character(dir)) stop('dir must be of class character')
+
+  model_dirs = setdiff(list.dirs(dir), dir)
+  model_objs = {}
+  for(model_dir in model_dirs) {
+    print(paste("Loading ", basename(model_dir), "....",sep = ""))
+    temp_model = h2o.loadModel(object, path = model_dir)
+    model_objs = append(x = model_objs, values = temp_model)
+  }
+
+  model_objs
+}
+