Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/h2oai/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
arnocandel committed Jan 27, 2015
2 parents 07f07f5 + a81899c commit 9e8d975
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 37 deletions.
23 changes: 13 additions & 10 deletions R/h2o-package/R/Algorithms.R
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ h2o.coxph <- function(x, y, data, key = "", weights = NULL, offset = NULL,
n.censor = res[[3L]]$n_censor,
surv = NULL,
type = ifelse(ny == 2L, "right", "counting"),
cumhaz = NULL,
std.err = NULL,
cumhaz = res[[3L]]$cumhaz_0,
std.err = list(var_cumhaz_1 = res[[3L]]$var_cumhaz_1, var_cumhaz_2 = res[[3L]]$var_cumhaz_2),
upper = NULL,
lower = NULL,
conf.type = NULL,
Expand Down Expand Up @@ -1079,11 +1079,13 @@ h2o.pcr <- function(x, y, data, key = "", ncomp, family, nfolds = 10, alpha = 0.

# ----------------------------------- Random Forest --------------------------------- #
h2o.randomForest <- function(x, y, data, key="", classification=TRUE, ntree=50, depth=20, mtries = -1, sample.rate=2/3,
nbins=20, seed=-1, importance=FALSE, nfolds=0, validation, holdout.fraction=0, nodesize=1,
balance.classes=FALSE, max.after.balance.size=5, class.sampling.factors = NULL, doGrpSplit=TRUE, verbose = FALSE,
oobee = TRUE, stat.type = "ENTROPY", type = "fast") {
nbins=20, seed=-1, importance=FALSE, score.each.iteration=FALSE, nfolds=0, validation,
holdout.fraction=0, nodesize=1, balance.classes=FALSE, max.after.balance.size=5,
class.sampling.factors = NULL, doGrpSplit=TRUE, verbose = FALSE, oobee = TRUE,
stat.type = "ENTROPY", type = "fast") {
if (type == "fast") {
if (!is.null(class.sampling.factors)) stop("class.sampling.factors requires type=BigData.")
if (!is.null(class.sampling.factors)) stop("class.sampling.factors requires type = 'BigData'.")
if(score.each.iteration) stop("score.each.iteration = TRUE requires type = 'BigData'")
return(h2o.SpeeDRF(x, y, data, key, classification, nfolds, validation, holdout.fraction, mtries, ntree, depth, sample.rate, oobee,
importance, nbins, seed, stat.type, balance.classes, verbose))
}
Expand All @@ -1103,6 +1105,7 @@ h2o.randomForest <- function(x, y, data, key="", classification=TRUE, ntree=50,
if( any(nbins < 1)) stop('nbins must be an integer >= 1')
if(!is.numeric(seed)) stop("seed must be an integer >= 0")
if(!is.logical(importance)) stop("importance must be logical (TRUE or FALSE)')")
if(!is.logical(score.each.iteration)) stop("score.each.iteration must be logical (TRUE or FALSE)")

if(!is.logical(balance.classes)) stop('balance.classes must be logical (TRUE or FALSE)')
if(!is.numeric(max.after.balance.size)) stop('max.after.balance.size must be a number')
Expand All @@ -1122,16 +1125,16 @@ h2o.randomForest <- function(x, y, data, key="", classification=TRUE, ntree=50,
# NB: externally, 1 based indexing; internally, 0 based
cols <- paste(args$x_i - 1, collapse=',')
if(missing(validation) && nfolds == 0) {
res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, mtries = mtries, seed=seed, importance=as.numeric(importance),
res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, mtries = mtries, seed=seed, importance=as.numeric(importance), score_each_iteration=as.numeric(score.each.iteration),
classification=as.numeric(classification), holdout_fraction = as.numeric(holdout.fraction), balance_classes=as.numeric(balance.classes), max_after_balance_size=as.numeric(max.after.balance.size), class_sampling_factors = class.sampling.factors, do_grpsplit=as.numeric(doGrpSplit))
} else if(missing(validation) && nfolds >= 2) {
res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance),
res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance), score_each_iteration=as.numeric(score.each.iteration),
classification=as.numeric(classification), n_folds=nfolds, balance_classes=as.numeric(balance.classes), max_after_balance_size=as.numeric(max.after.balance.size), class_sampling_factors = class.sampling.factors, do_grpsplit=as.numeric(doGrpSplit))
} else if(!missing(validation) && nfolds == 0) {
res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance),
res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, destination_key=key, response=args$y, cols=cols, ntrees=ntree, mtries = mtries, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, importance=as.numeric(importance), score_each_iteration=as.numeric(score.each.iteration),
classification=as.numeric(classification), validation=validation@key, balance_classes=as.numeric(balance.classes), max_after_balance_size=as.numeric(max.after.balance.size), class_sampling_factors = class.sampling.factors, do_grpsplit=as.numeric(doGrpSplit))
} else stop("Cannot set both validation and nfolds at the same time")
params = list(x=args$x, y=args$y, type="BigData", ntree=ntree, mtries = mtries, depth=depth, nbins=nbins, sample.rate=sample.rate, nbins=nbins, importance=importance, nfolds=nfolds, balance.classes=balance.classes, max.after.balance.size=max.after.balance.size, class.sampling.factors = class.sampling.factors, nodesize=nodesize, h2o = data@h2o)
params = list(x=args$x, y=args$y, type="BigData", ntree=ntree, mtries = mtries, depth=depth, nbins=nbins, sample.rate=sample.rate, nbins=nbins, importance=importance, score.each.iteration=score.each.iteration, nfolds=nfolds, balance.classes=balance.classes, max.after.balance.size=max.after.balance.size, class.sampling.factors = class.sampling.factors, nodesize=nodesize, h2o = data@h2o)

if(.is_singlerun("RF", params))
.h2o.singlerun.internal("RF", data, res, nfolds, validation, params)
Expand Down
13 changes: 13 additions & 0 deletions R/h2o-package/R/Classes.R
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,19 @@ function(formula, newdata, conf.int = 0.95,
capture.output(newdata <- as.h2o(formula@data@h2o, newdata, header = TRUE))
conf.type <- match.arg(conf.type)

# Code below has calculation performed in R
pred <- as.data.frame(h2o.predict(formula, newdata))[[1L]]
res <- formula@survfit
if (length(pred) == 1L)
res$cumhaz <- pred * res$cumhaz
else
res$cumhaz <- outer(res$cumhaz, pred, FUN = "*")
res$std.err <- NULL
res$surv <- exp(- res$cumhaz)
class(res) <- c("survfit.H2OCoxPHModel", "survfit.cox", "survfit")
return(res)

# Code below assumes calculation in H2O
pred <- as.matrix(h2o.predict(formula, newdata)[,-1L])
nms <- colnames(pred)
dimnames(pred) <- NULL
Expand Down
3 changes: 2 additions & 1 deletion R/h2o-package/demo/h2o.kmeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
prostate.hex = h2o.uploadFile(localH2O, path = system.file("extdata", "prostate.csv", package="h2o"), key = "prostate.hex")
summary(prostate.hex)
prostate.km = h2o.kmeans(prostate.hex, centers = 10, cols = c("AGE","RACE","GLEASON","CAPSULE","DCAPS"))
prostate.pred = h2o.predict(object = prostate.km, newdata = prostate.hex)
print(prostate.km)

prostate.data = as.data.frame(prostate.hex)
prostate.clus = as.data.frame(prostate.km@model$cluster)
prostate.clus = as.data.frame(prostate.pred)

# Plot categorized data
# if(!"fpc" %in% rownames(installed.packages())) install.packages("fpc")
Expand Down
11 changes: 7 additions & 4 deletions R/h2o-package/man/h2o.randomForest.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ Performs random forest classification on a data set.
\usage{
h2o.randomForest(x, y, data, key = "", classification = TRUE, ntree = 50,
depth = 20, mtries = -1, sample.rate = 2/3, nbins = 20, seed = -1,
importance = FALSE, nfolds = 0, validation, holdout.fraction = 0, nodesize = 1,
balance.classes = FALSE, max.after.balance.size = 5, class.sampling.factors = NULL,
doGrpSplit = TRUE, verbose = FALSE, oobee = TRUE, stat.type = "ENTROPY",
type = "fast")
importance = FALSE, score.each.iteration = FALSE, nfolds = 0, validation,
holdout.fraction = 0, nodesize = 1, balance.classes = FALSE,
max.after.balance.size = 5, class.sampling.factors = NULL, doGrpSplit = TRUE,
verbose = FALSE, oobee = TRUE, stat.type = "ENTROPY", type = "fast")
}
%- maybe also 'usage' for other objects documented here.
\arguments{
Expand Down Expand Up @@ -55,6 +55,9 @@ An \code{\linkS4class{H2OParsedData}} object containing the variables in the mod
(Optional) A logical value indicating whether to calculate variable importance. Set to \code{FALSE} to speed
up computations.
}
\item{score.each.iteration}{
(Optional) A logical value indicating whether to perform scoring after every iteration. Set to \code{FALSE} to speed up computations. Note that this can only be set to \code{TRUE} if \code{type = "BigData"}.
}
\item{nfolds}{
(Optional) Number of folds for cross-validation. If \code{nfolds >= 2}, then \code{validation} must remain empty.
}
Expand Down
20 changes: 10 additions & 10 deletions R/tests/Utils/coxphR.R
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,16 @@ checkCoxPHSurvfit <- function(survfitCoxPH.h2o, survfitCoxPH.r, tolerance = 1e-8
checkEquals(survfitCoxPH.r$type, survfitCoxPH.h2o$type)
checkEquals(survfitCoxPH.r$cumhaz, survfitCoxPH.h2o$cumhaz,
tolerance = sqrt(tolerance))
checkEquals(survfitCoxPH.r$std.err, survfitCoxPH.h2o$std.err,
tolerance = sqrt(tolerance))
checkEquals(survfitCoxPH.r$upper, survfitCoxPH.h2o$upper,
tolerance = sqrt(tolerance),
check.attributes = FALSE)
checkEquals(survfitCoxPH.r$lower, survfitCoxPH.h2o$lower,
tolerance = sqrt(tolerance),
check.attributes = FALSE)
checkEquals(survfitCoxPH.r$conf.type, survfitCoxPH.h2o$conf.type)
checkEquals(survfitCoxPH.r$conf.int, survfitCoxPH.h2o$conf.int)
#checkEquals(survfitCoxPH.r$std.err, survfitCoxPH.h2o$std.err,
# tolerance = sqrt(tolerance))
#checkEquals(survfitCoxPH.r$upper, survfitCoxPH.h2o$upper,
# tolerance = sqrt(tolerance),
# check.attributes = FALSE)
#checkEquals(survfitCoxPH.r$lower, survfitCoxPH.h2o$lower,
# tolerance = sqrt(tolerance),
# check.attributes = FALSE)
#checkEquals(survfitCoxPH.r$conf.type, survfitCoxPH.h2o$conf.type)
#checkEquals(survfitCoxPH.r$conf.int, survfitCoxPH.h2o$conf.int)

invisible(TRUE)
}
4 changes: 2 additions & 2 deletions h2o-docs/source/faq/hadoop_related.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
Hadoop FAQ
==========

***************
****************
Accessing Logs
***************
****************

Depending on whether you are using Hadoop with H2O and whether the job is currently running, there are different ways of obtaining the logs for H2O.

Expand Down
33 changes: 23 additions & 10 deletions packaging/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@
#documentation {
border-top: 1px solid #ddd;
}
#get-started {
margin-top: 15px;
border-top: 1px solid #ddd;
}

</style>
<script>
Expand Down Expand Up @@ -276,16 +280,25 @@ <h1><span>H<sub>2</sub>O</span></h1>
<div id="downloads">
<a id="dzip" class="download-button" href="h2o-SUBST_PROJECT_VERSION.zip">Download H<sub>2</sub>O</a>
</div>
<h2>Get started with H<sub>2</sub>O in 3 easy steps</h2>
<p>1. Download H<sub>2</sub>O. This is a zip file that contains everything you need to get started.</p>
<p>2. From your terminal, run:</p>
<p class="terminal">
cd ~/Downloads<br/>
unzip h2o-SUBST_PROJECT_VERSION.zip<br/>
cd h2o-SUBST_PROJECT_VERSION<br/>
java -jar h2o.jar<br/>
</p>
<p>3. Point your browser to <a href="http://localhost:54321/" target="_blank">http://localhost:54321</a>
<div id="license">
<h2>License</h2>
<ul>
<li>Everything in this version of H<sub>2</sub>O (including H<sub>2</sub>O for R and Hadoop) is licensed under the <a id="license-h2o" href="http://www.apache.org/licenses/LICENSE-2.0">Apache V2 License</a>.</li>
<li>Generated Java scoring POJOs are also licensed under the <a id="license-scoring" href="http://www.apache.org/licenses/LICENSE-2.0">Apache V2 License</a> (since the Mirzakhani release).</li>
</ul>
</div>
<div id="get-started">
<h2>Get started with H<sub>2</sub>O in 3 easy steps</h2>
<p>1. Download H<sub>2</sub>O. This is a zip file that contains everything you need to get started.</p>
<p>2. From your terminal, run:</p>
<p class="terminal">
cd ~/Downloads<br/>
unzip h2o-SUBST_PROJECT_VERSION.zip<br/>
cd h2o-SUBST_PROJECT_VERSION<br/>
java -jar h2o.jar<br/>
</p>
<p>3. Point your browser to <a href="http://localhost:54321/" target="_blank">http://localhost:54321</a>
</div>
</div>
<div id="quickstart-r" style="display:none">
<h2>Use H<sub>2</sub>O directly from R</h2>
Expand Down
41 changes: 41 additions & 0 deletions src/main/java/hex/CoxPH.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ public CoxPHModel(CoxPH job, Key selfKey, Key dataKey, Frame fr, float[] priorCl
public float progress() { return (float) iter / (float) get_params().iter_max; }

// Following three overrides created for use in super.scoreImpl
/*
@Override
public String[] classNames() {
final String[] names = new String[nclasses()];
Expand Down Expand Up @@ -236,6 +237,46 @@ protected float[] score0(double[] data, float[] preds) {
preds[0] = Float.NaN;
return preds;
}
*/

@Override
protected float[] score0(double[] data, float[] preds) {
final int n_offsets = (parameters.offset_columns == null) ? 0 : parameters.offset_columns.length;
final int n_cats = data_info._cats;
final int n_nums = data_info._nums;
final int n_data = n_cats + n_nums;
final int numStart = data_info.numStart();
final int n_non_offsets = n_nums - n_offsets;
boolean catsAllNA = true;
boolean catsHasNA = false;
boolean numsHasNA = false;
for (int j = 0; j < n_cats; ++j) {
catsAllNA &= Double.isNaN(data[j]);
catsHasNA |= Double.isNaN(data[j]);
}
for (int j = n_cats; j < n_data; ++j)
numsHasNA |= Double.isNaN(data[j]);
if (numsHasNA || (catsHasNA && !catsAllNA)) {
preds[0] = Float.NaN;
} else {
double logRisk = 0;
for (int j = 0; j < n_cats; ++j) {
final int k_start = data_info._catOffsets[j];
final int k_end = data_info._catOffsets[j + 1];
if (Double.isNaN(data[j]))
for (int k = k_start; k < k_end; ++k)
logRisk += x_mean_cat[k] * coef[k];
else if (data[j] != 0)
logRisk += coef[k_start + (int) (data[j] - 1)];
}
for (int j = 0; j < n_non_offsets; ++j)
logRisk += (data[n_cats + j] - data_info._normSub[j]) * coef[numStart + j];
for (int j = n_non_offsets; j < n_nums; ++j)
logRisk += (data[n_cats + j] - data_info._normSub[j]);
preds[0] = (float) Math.exp(logRisk);
}
return preds;
}

protected void initStats(final Frame source, final DataInfo dinfo) {
n = source.numRows();
Expand Down

0 comments on commit 9e8d975

Please sign in to comment.