Merge branch 'master' of https://github.com/h2oai/h2o

AlicTsai · Jan 27, 2015 · 9e8d975 · 9e8d975
2 parents 07f07f5 + a81899c
commit 9e8d975
Show file tree

Hide file tree

Showing 8 changed files with 111 additions and 37 deletions.
diff --git a/R/h2o-package/R/Algorithms.R b/R/h2o-package/R/Algorithms.R
@@ -125,8 +125,8 @@ h2o.coxph <- function(x, y, data, key = "", weights = NULL, offset = NULL,
          n.censor     = res[[3L]]$n_censor,
          surv         = NULL,
          type         = ifelse(ny == 2L, "right", "counting"),
-         cumhaz       = NULL,
-         std.err      = NULL,
+         cumhaz       = res[[3L]]$cumhaz_0,
+         std.err      = list(var_cumhaz_1 = res[[3L]]$var_cumhaz_1, var_cumhaz_2 = res[[3L]]$var_cumhaz_2),
          upper        = NULL,
          lower        = NULL,
          conf.type    = NULL,
@@ -1079,11 +1079,13 @@ h2o.pcr <- function(x, y, data, key = "", ncomp, family, nfolds = 10, alpha = 0.
 
 # ----------------------------------- Random Forest --------------------------------- #
 h2o.randomForest <- function(x, y, data, key="", classification=TRUE, ntree=50, depth=20, mtries = -1, sample.rate=2/3,
-                             nbins=20, seed=-1, importance=FALSE, nfolds=0, validation, holdout.fraction=0, nodesize=1,
-                             balance.classes=FALSE, max.after.balance.size=5, class.sampling.factors = NULL, doGrpSplit=TRUE, verbose = FALSE,
-                             oobee = TRUE, stat.type = "ENTROPY", type = "fast") {
+                             nbins=20, seed=-1, importance=FALSE, score.each.iteration=FALSE, nfolds=0, validation, 
+                             holdout.fraction=0, nodesize=1, balance.classes=FALSE, max.after.balance.size=5, 
+                             class.sampling.factors = NULL, doGrpSplit=TRUE, verbose = FALSE, oobee = TRUE, 
+                             stat.type = "ENTROPY", type = "fast") {
   if (type == "fast") {
-    if (!is.null(class.sampling.factors)) stop("class.sampling.factors requires type=BigData.")
+    if (!is.null(class.sampling.factors)) stop("class.sampling.factors requires type = 'BigData'.")
+    if(score.each.iteration) stop("score.each.iteration = TRUE requires type = 'BigData'")
     return(h2o.SpeeDRF(x, y, data, key, classification, nfolds, validation, holdout.fraction, mtries, ntree, depth, sample.rate, oobee,
                        importance, nbins, seed, stat.type, balance.classes, verbose))
   }
@@ -1103,6 +1105,7 @@ h2o.randomForest <- function(x, y, data, key="", classification=TRUE, ntree=50,
   if( any(nbins < 1)) stop('nbins must be an integer >= 1')
   if(!is.numeric(seed)) stop("seed must be an integer >= 0")
   if(!is.logical(importance)) stop("importance must be logical (TRUE or FALSE)')")
+  if(!is.logical(score.each.iteration)) stop("score.each.iteration must be logical (TRUE or FALSE)")
 
   if(!is.logical(balance.classes)) stop('balance.classes must be logical (TRUE or FALSE)')
   if(!is.numeric(max.after.balance.size)) stop('max.after.balance.size must be a number')
@@ -1122,16 +1125,16 @@ h2o.randomForest <- function(x, y, data, key="", classification=TRUE, ntree=50,
   # NB: externally, 1 based indexing; internally, 0 based
   cols <- paste(args$x_i - 1, collapse=',')
   if(missing(validation) && nfolds == 0) {
-    res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, mtries = mtries, seed=seed, importance=as.numeric(importance),
+    res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, mtries = mtries, seed=seed, importance=as.numeric(importance), score_each_iteration=as.numeric(score.each.iteration),
                             classification=as.numeric(classification), holdout_fraction = as.numeric(holdout.fraction), balance_classes=as.numeric(balance.classes), max_after_balance_size=as.numeric(max.after.balance.size), class_sampling_factors = class.sampling.factors, do_grpsplit=as.numeric(doGrpSplit))
   } else if(missing(validation) && nfolds >= 2) {
-    res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance),
+    res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance), score_each_iteration=as.numeric(score.each.iteration),
                             classification=as.numeric(classification), n_folds=nfolds, balance_classes=as.numeric(balance.classes), max_after_balance_size=as.numeric(max.after.balance.size), class_sampling_factors = class.sampling.factors, do_grpsplit=as.numeric(doGrpSplit))
   } else if(!missing(validation) && nfolds == 0) {
-    res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance),
+    res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance), score_each_iteration=as.numeric(score.each.iteration),
                             classification=as.numeric(classification), validation=validation@key, balance_classes=as.numeric(balance.classes), max_after_balance_size=as.numeric(max.after.balance.size), class_sampling_factors = class.sampling.factors, do_grpsplit=as.numeric(doGrpSplit))
   } else stop("Cannot set both validation and nfolds at the same time")
-  params = list(x=args$x, y=args$y, type="BigData", ntree=ntree, mtries = mtries, depth=depth, nbins=nbins, sample.rate=sample.rate, nbins=nbins, importance=importance, nfolds=nfolds, balance.classes=balance.classes, max.after.balance.size=max.after.balance.size, class.sampling.factors = class.sampling.factors, nodesize=nodesize, h2o = data@h2o)
+  params = list(x=args$x, y=args$y, type="BigData", ntree=ntree, mtries = mtries, depth=depth, nbins=nbins, sample.rate=sample.rate, nbins=nbins, importance=importance, score.each.iteration=score.each.iteration, nfolds=nfolds, balance.classes=balance.classes, max.after.balance.size=max.after.balance.size, class.sampling.factors = class.sampling.factors, nodesize=nodesize, h2o = data@h2o)
 
   if(.is_singlerun("RF", params))
     .h2o.singlerun.internal("RF", data, res, nfolds, validation, params)

diff --git a/R/h2o-package/R/Classes.R b/R/h2o-package/R/Classes.R
@@ -214,6 +214,19 @@ function(formula, newdata, conf.int = 0.95,
     capture.output(newdata <- as.h2o(formula@data@h2o, newdata, header = TRUE))
   conf.type <- match.arg(conf.type)
 
+  # Code below has calculation performed in R
+  pred <- as.data.frame(h2o.predict(formula, newdata))[[1L]]
+  res <- formula@survfit
+  if (length(pred) == 1L)
+    res$cumhaz <- pred * res$cumhaz
+  else
+    res$cumhaz <- outer(res$cumhaz, pred, FUN = "*")
+  res$std.err <- NULL
+  res$surv <- exp(- res$cumhaz)
+  class(res) <- c("survfit.H2OCoxPHModel", "survfit.cox", "survfit")
+  return(res)
+
+  # Code below assumes calculation in H2O
   pred <- as.matrix(h2o.predict(formula, newdata)[,-1L])
   nms <- colnames(pred)
   dimnames(pred) <- NULL

diff --git a/R/h2o-package/demo/h2o.kmeans.R b/R/h2o-package/demo/h2o.kmeans.R
@@ -8,10 +8,11 @@ localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
 prostate.hex = h2o.uploadFile(localH2O, path = system.file("extdata", "prostate.csv", package="h2o"), key = "prostate.hex")
 summary(prostate.hex)
 prostate.km = h2o.kmeans(prostate.hex, centers = 10, cols = c("AGE","RACE","GLEASON","CAPSULE","DCAPS"))
+prostate.pred = h2o.predict(object = prostate.km, newdata = prostate.hex)
 print(prostate.km)
 
 prostate.data = as.data.frame(prostate.hex)
-prostate.clus = as.data.frame(prostate.km@model$cluster)
+prostate.clus = as.data.frame(prostate.pred)
 
 # Plot categorized data
 # if(!"fpc" %in% rownames(installed.packages())) install.packages("fpc")

diff --git a/R/h2o-package/man/h2o.randomForest.Rd b/R/h2o-package/man/h2o.randomForest.Rd
@@ -9,10 +9,10 @@ Performs random forest classification on a data set.
 \usage{
 h2o.randomForest(x, y, data, key = "", classification = TRUE, ntree = 50, 
   depth = 20, mtries = -1, sample.rate = 2/3, nbins = 20, seed = -1,
-  importance = FALSE, nfolds = 0, validation, holdout.fraction = 0, nodesize = 1,
-  balance.classes = FALSE, max.after.balance.size = 5, class.sampling.factors = NULL,
-  doGrpSplit = TRUE, verbose = FALSE, oobee = TRUE, stat.type = "ENTROPY",
-  type = "fast")
+  importance = FALSE, score.each.iteration = FALSE, nfolds = 0, validation, 
+  holdout.fraction = 0, nodesize = 1, balance.classes = FALSE, 
+  max.after.balance.size = 5, class.sampling.factors = NULL, doGrpSplit = TRUE, 
+  verbose = FALSE, oobee = TRUE, stat.type = "ENTROPY", type = "fast")
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -55,6 +55,9 @@ An \code{\linkS4class{H2OParsedData}} object containing the variables in the mod
   (Optional) A logical value indicating whether to calculate variable importance. Set to \code{FALSE} to speed
   up computations.
   }
+  \item{score.each.iteration}{
+  (Optional) A logical value indicating whether to perform scoring after every iteration. Set to \code{FALSE} to speed up computations. Note that this can only be set to \code{TRUE} if \code{type = "BigData"}.
+  }
   \item{nfolds}{
   (Optional) Number of folds for cross-validation. If \code{nfolds >= 2}, then \code{validation} must remain empty.
 }

diff --git a/R/tests/Utils/coxphR.R b/R/tests/Utils/coxphR.R
@@ -112,16 +112,16 @@ checkCoxPHSurvfit <- function(survfitCoxPH.h2o, survfitCoxPH.r, tolerance = 1e-8
   checkEquals(survfitCoxPH.r$type,      survfitCoxPH.h2o$type)
   checkEquals(survfitCoxPH.r$cumhaz,    survfitCoxPH.h2o$cumhaz,
               tolerance = sqrt(tolerance))
-  checkEquals(survfitCoxPH.r$std.err,   survfitCoxPH.h2o$std.err,
-              tolerance = sqrt(tolerance))
-  checkEquals(survfitCoxPH.r$upper,     survfitCoxPH.h2o$upper,
-              tolerance = sqrt(tolerance),
-              check.attributes = FALSE)
-  checkEquals(survfitCoxPH.r$lower,     survfitCoxPH.h2o$lower,
-              tolerance = sqrt(tolerance),
-              check.attributes = FALSE)
-  checkEquals(survfitCoxPH.r$conf.type, survfitCoxPH.h2o$conf.type)
-  checkEquals(survfitCoxPH.r$conf.int,  survfitCoxPH.h2o$conf.int)
+  #checkEquals(survfitCoxPH.r$std.err,   survfitCoxPH.h2o$std.err,
+  #            tolerance = sqrt(tolerance))
+  #checkEquals(survfitCoxPH.r$upper,     survfitCoxPH.h2o$upper,
+  #            tolerance = sqrt(tolerance),
+  #            check.attributes = FALSE)
+  #checkEquals(survfitCoxPH.r$lower,     survfitCoxPH.h2o$lower,
+  #            tolerance = sqrt(tolerance),
+  #            check.attributes = FALSE)
+  #checkEquals(survfitCoxPH.r$conf.type, survfitCoxPH.h2o$conf.type)
+  #checkEquals(survfitCoxPH.r$conf.int,  survfitCoxPH.h2o$conf.int)
 
   invisible(TRUE)
 }
diff --git a/h2o-docs/source/faq/hadoop_related.rst b/h2o-docs/source/faq/hadoop_related.rst
@@ -4,9 +4,9 @@
 Hadoop FAQ
 ==========
 
-***************
+****************
 Accessing Logs
-***************
+****************
 
 Depending on whether you are using Hadoop with H2O and whether the job is currently running, there are different ways of obtaining the logs for H2O. 
 

diff --git a/packaging/index.html b/packaging/index.html
@@ -178,6 +178,10 @@
       #documentation {
         border-top: 1px solid #ddd;
       }
+      #get-started {
+        margin-top: 15px;
+        border-top: 1px solid #ddd;
+      }
 
     </style>
     <script>
@@ -276,16 +280,25 @@ <h1><span>H<sub>2</sub>O</span></h1>
             <div id="downloads">
               <a id="dzip" class="download-button" href="h2o-SUBST_PROJECT_VERSION.zip">Download H<sub>2</sub>O</a>
             </div>
-            <h2>Get started with H<sub>2</sub>O in 3 easy steps</h2>
-            <p>1. Download H<sub>2</sub>O. This is a zip file that contains everything you need to get started.</p>
-            <p>2. From your terminal, run:</p>
-            <p class="terminal">
-              cd ~/Downloads<br/>
-              unzip h2o-SUBST_PROJECT_VERSION.zip<br/>
-              cd h2o-SUBST_PROJECT_VERSION<br/>
-              java -jar h2o.jar<br/>
-            </p>
-            <p>3. Point your browser to <a href="http://localhost:54321/" target="_blank">http://localhost:54321</a>
+            <div id="license">
+              <h2>License</h2>
+              <ul>
+                <li>Everything in this version of H<sub>2</sub>O (including H<sub>2</sub>O for R and Hadoop) is licensed under the <a id="license-h2o" href="http://www.apache.org/licenses/LICENSE-2.0">Apache V2 License</a>.</li>
+                <li>Generated Java scoring POJOs are also licensed under the <a id="license-scoring" href="http://www.apache.org/licenses/LICENSE-2.0">Apache V2 License</a> (since the Mirzakhani release).</li>
+              </ul>
+            </div>
+            <div id="get-started">
+              <h2>Get started with H<sub>2</sub>O in 3 easy steps</h2>
+              <p>1. Download H<sub>2</sub>O. This is a zip file that contains everything you need to get started.</p>
+              <p>2. From your terminal, run:</p>
+              <p class="terminal">
+                cd ~/Downloads<br/>
+                unzip h2o-SUBST_PROJECT_VERSION.zip<br/>
+                cd h2o-SUBST_PROJECT_VERSION<br/>
+                java -jar h2o.jar<br/>
+              </p>
+              <p>3. Point your browser to <a href="http://localhost:54321/" target="_blank">http://localhost:54321</a>
+            </div>
           </div>
           <div id="quickstart-r" style="display:none">
             <h2>Use H<sub>2</sub>O directly from R</h2>

diff --git a/src/main/java/hex/CoxPH.java b/src/main/java/hex/CoxPH.java
@@ -165,6 +165,7 @@ public CoxPHModel(CoxPH job, Key selfKey, Key dataKey, Frame fr, float[] priorCl
     public float progress() { return (float) iter / (float) get_params().iter_max; }
 
     // Following three overrides created for use in super.scoreImpl
+    /*
     @Override
     public String[] classNames() {
       final String[] names = new String[nclasses()];
@@ -236,6 +237,46 @@ protected float[] score0(double[] data, float[] preds) {
       preds[0] = Float.NaN;
       return preds;
     }
+    */
+
+    @Override
+    protected float[] score0(double[] data, float[] preds) {
+      final int n_offsets = (parameters.offset_columns == null) ? 0 : parameters.offset_columns.length;
+      final int n_cats    = data_info._cats;
+      final int n_nums    = data_info._nums;
+      final int n_data    = n_cats + n_nums;
+      final int numStart  = data_info.numStart();
+      final int n_non_offsets = n_nums - n_offsets;
+      boolean catsAllNA   = true;
+      boolean catsHasNA   = false;
+      boolean numsHasNA   = false;
+      for (int j = 0; j < n_cats; ++j) {
+        catsAllNA &= Double.isNaN(data[j]);
+        catsHasNA |= Double.isNaN(data[j]);
+      }
+      for (int j = n_cats; j < n_data; ++j)
+        numsHasNA |= Double.isNaN(data[j]);
+      if (numsHasNA || (catsHasNA && !catsAllNA)) {
+        preds[0] = Float.NaN;
+      } else {
+        double logRisk = 0;
+        for (int j = 0; j < n_cats; ++j) {
+          final int k_start = data_info._catOffsets[j];
+          final int k_end   = data_info._catOffsets[j + 1];
+          if (Double.isNaN(data[j]))
+            for (int k = k_start; k < k_end; ++k)
+              logRisk += x_mean_cat[k] * coef[k];
+          else if (data[j] != 0)
+            logRisk += coef[k_start + (int) (data[j] - 1)];
+        }
+        for (int j = 0; j < n_non_offsets; ++j)
+          logRisk += (data[n_cats + j] - data_info._normSub[j]) * coef[numStart + j];
+        for (int j = n_non_offsets; j < n_nums; ++j)
+          logRisk += (data[n_cats + j] - data_info._normSub[j]);
+        preds[0] = (float) Math.exp(logRisk);
+      }
+      return preds;
+    }
 
     protected void initStats(final Frame source, final DataInfo dinfo) {
       n = source.numRows();