Merge branch 'master' of github.com:0xdata/h2o

nuoncul · Jul 16, 2014 · 50bc74b · 50bc74b
2 parents a4c443d + 1f40457
commit 50bc74b
Show file tree

Hide file tree

Showing 10 changed files with 172 additions and 111 deletions.
diff --git a/R/h2o-package/R/Algorithms.R b/R/h2o-package/R/Algorithms.R
diff --git a/R/h2o-package/man/h2o.SpeeDRF.Rd b/R/h2o-package/man/h2o.SpeeDRF.Rd
@@ -7,10 +7,10 @@ H2O: Single-Node Random Forest
 Performs single-node random forest classification on a data set.
 }
 \usage{
-h2o.SpeeDRF(x, y, data, classification = TRUE, nfolds = 0, validation, mtry = -1, 
-  ntree = 50, depth = 50, sample.rate = 2/3, oobee = TRUE, importance = FALSE, 
-  nbins = 1024, seed = -1, stat.type = "ENTROPY", balance.classes = FALSE, 
-  verbose = FALSE)
+h2o.SpeeDRF(x, y, data, key = "", classification = TRUE, nfolds = 0, validation, 
+  mtry = -1, ntree = 50, depth = 50, sample.rate = 2/3, oobee = TRUE, 
+  importance = FALSE, nbins = 1024, seed = -1, stat.type = "ENTROPY", 
+  balance.classes = FALSE, verbose = FALSE)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -22,6 +22,9 @@ The name or index of the response variable. If the data does not contain a heade
 }
   \item{data}{
 An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
+}
+  \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
 }
   \item{classification}{
 (Optional) A logical value indicating whether a classification model should be built (as opposed to regression).

diff --git a/R/h2o-package/man/h2o.deeplearning.Rd b/R/h2o-package/man/h2o.deeplearning.Rd
@@ -7,8 +7,8 @@ H2O: Deep Learning Neural Networks
 Trains a Deep Learning neural network on an \code{\linkS4class{H2OParsedData}} object.
 }
 \usage{
-h2o.deeplearning(x, y, data, classification = TRUE, nfolds = 0, validation,
-    autoencoder, use_all_factor_levels,
+h2o.deeplearning(x, y, data, key = "", classification = TRUE, nfolds = 0, 
+    validation, autoencoder, use_all_factor_levels,
     activation, hidden, epochs, train_samples_per_iteration, seed, adaptive_rate,
     rho, epsilon, rate, rate_annealing, rate_decay, momentum_start,
     momentum_ramp, momentum_stable, nesterov_accelerated_gradient,
@@ -28,6 +28,7 @@ h2o.deeplearning(x, y, data, classification = TRUE, nfolds = 0, validation,
         \item{x}{ A vector containing the names of the predictors in the model. }
         \item{y}{ The name of the response variable in the model. }
         \item{data}{ An \code{\linkS4class{H2OParsedData}} object containing the variables in the model. }
+          \item{key}{ (Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.}
         \item{classification}{ (Optional) A logical value indicating whether the algorithm should conduct classification. }
         \item{nfolds}{(Optional) Number of folds for cross-validation. If \code{nfolds >= 2}, then \code{validation} must remain empty.}
         \item{validation}{(Optional) An \code{\linkS4class{H2OParsedData}} object indicating the validation dataset used to construct the confusion matrix. If left blank, this defaults to the training data when \code{nfolds = 0}.}

diff --git a/R/h2o-package/man/h2o.gbm.Rd b/R/h2o-package/man/h2o.gbm.Rd
@@ -10,9 +10,10 @@ H2O: Gradient Boosted Machines
 	 on a parsed data set.
 }
 \usage{
-h2o.gbm(x, y, distribution = "multinomial", data, n.trees = 10, interaction.depth = 5, 
-  n.minobsinnode = 10, shrinkage = 0.1, n.bins = 100, importance = FALSE, nfolds = 0,
-  validation, balance.classes = FALSE, max.after.balance.size = 5)
+h2o.gbm(x, y, distribution = "multinomial", data, key = "", n.trees = 10, 
+  interaction.depth = 5, n.minobsinnode = 10, shrinkage = 0.1, n.bins = 100, 
+  importance = FALSE, nfolds = 0, validation, balance.classes = FALSE, 
+  max.after.balance.size = 5)
 }
 \arguments{
   \item{x}{
@@ -26,6 +27,9 @@ The type of GBM model to be produced: classification is "multinomial" (default),
 }
   \item{data}{
 An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
+}
+  \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
 }
   \item{n.trees}{
 (Optional) Number of trees to grow. Must be a nonnegative integer.

diff --git a/R/h2o-package/man/h2o.glm.Rd b/R/h2o-package/man/h2o.glm.Rd
@@ -7,11 +7,12 @@ H2O: Generalized Linear Models
 Fit a generalized linear model, specified by a response variable, a set of predictors, and a description of the error distribution.
 }
 \usage{
-h2o.glm(x, y, data, family, nfolds = 0, alpha = 0.5, nlambda = -1, lambda.min.ratio = -1, 
-  lambda = 1e-5, epsilon = 1e-4, standardize = TRUE, prior, variable_importances = 1, 
-  use_all_factor_levels = 0, tweedie.p = ifelse(family == 'tweedie', 1.5, 
-  as.numeric(NA)), iter.max = 100, higher_accuracy = FALSE, lambda_search = FALSE, 
-  return_all_lambda = FALSE, max_predictors = -1)
+h2o.glm(x, y, data, key = "", family, nfolds = 0, alpha = 0.5, nlambda = -1, 
+  lambda.min.ratio = -1, lambda = 1e-5, epsilon = 1e-4, standardize = TRUE, 
+  prior, variable_importances = 1, use_all_factor_levels = 0, tweedie.p = 
+  ifelse(family == 'tweedie', 1.5, as.numeric(NA)), iter.max = 100, 
+  higher_accuracy = FALSE, lambda_search = FALSE, return_all_lambda = FALSE, 
+  max_predictors = -1)
 }
 \arguments{
   \item{x}{
@@ -22,6 +23,9 @@ The name of the response variable in the model.
 }
   \item{data}{
 An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
+}
+  \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
 }
   \item{family}{
 A description of the error distribution and corresponding link function to be used in the model. Currently, Gaussian, binomial, Poisson, gamma, and Tweedie are supported. When a model is specified as Tweedie, users must also specify the appropriate Tweedie power. 

diff --git a/R/h2o-package/man/h2o.kmeans.Rd b/R/h2o-package/man/h2o.kmeans.Rd
@@ -1,58 +1,61 @@
-\name{h2o.kmeans}
-\alias{h2o.kmeans}
-\title{
-H2O: K-Means Clustering
-}
-\description{Performs k-means clustering on a data set.}
-\usage{
-h2o.kmeans(data, centers, cols = "", iter.max = 10, normalize = FALSE, 
-  init = "none", seed = 0, dropNACols = FALSE)
-}
-%- maybe also 'usage' for other objects documented here.
-\arguments{
-  \item{data}{
-An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
-}
-  \item{centers}{
-The number of clusters k.
-}
-  \item{cols}{
-(Optional) A vector containing the names of the data columns on which k-means runs. If blank, k-means clustering will be run on the entire data set.
-}
-  \item{iter.max}{
-(Optional) The maximum number of iterations allowed.
-}
-  \item{normalize}{
-(Optional) A logical value indicating whether the data should be normalized before running k-means.
-}
-  \item{init}{
-(Optional) Method by which to select the k initial cluster centroids. Possible values are \code{"none"} for random initialization, \code{"plusplus"} for k-means++ initialization, and \code{"furthest"} for initialization at the furthest point from each successive centroid. See the \href{http://docs.0xdata.com/datascience/kmeans.html}{H2O K-means documentation} for more details.
-  }
-  \item{seed}{
-  (Optional) Random seed used to initialize the cluster centroids.
-  }
-  \item{dropNACols}{
-  (Optional) A logical value indicating whether to drop columns with more than 10\% entries that are NA.
-  }
-}
-\value{
-An object of class \code{\linkS4class{H2OKMeansModel}} with slots key, data, and model, where the last is a list of the following components:
-  \item{centers }{A matrix of cluster centers.}
-  \item{cluster }{A \code{\linkS4class{H2OParsedData}} object containing the vector of integers (from 1 to k), which indicate the cluster to which each point is allocated.}
-  \item{size }{The number of points in each cluster.}
-  \item{withinss }{Vector of within-cluster sum of squares, with one component per cluster.}
-  \item{tot.withinss }{Total within-cluster sum of squares, i.e., sum(withinss).}
-}
-
-\seealso{
-%% ~~objects to See Also as \code{\link{help}}, ~~~
-\code{\link{h2o.importFile}, \link{h2o.importFolder}, \link{h2o.importHDFS}, \link{h2o.importURL}, \link{h2o.uploadFile}}
-}
-
-\examples{
-library(h2o)
-localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
-prosPath = system.file("extdata", "prostate.csv", package = "h2o")
-prostate.hex = h2o.importFile(localH2O, path = prosPath)
-h2o.kmeans(data = prostate.hex, centers = 10, cols = c("AGE", "RACE", "VOL", "GLEASON"))
-}
+\name{h2o.kmeans}
+\alias{h2o.kmeans}
+\title{
+H2O: K-Means Clustering
+}
+\description{Performs k-means clustering on a data set.}
+\usage{
+h2o.kmeans(data, centers, cols = "", key = "", iter.max = 10, 
+  normalize = FALSE, init = "none", seed = 0, dropNACols = FALSE)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{data}{
+An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
+}
+  \item{centers}{
+The number of clusters k.
+}
+  \item{cols}{
+(Optional) A vector containing the names of the data columns on which k-means runs. If blank, k-means clustering will be run on the entire data set.
+}
+  \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
+}
+  \item{iter.max}{
+(Optional) The maximum number of iterations allowed.
+}
+  \item{normalize}{
+(Optional) A logical value indicating whether the data should be normalized before running k-means.
+}
+  \item{init}{
+(Optional) Method by which to select the k initial cluster centroids. Possible values are \code{"none"} for random initialization, \code{"plusplus"} for k-means++ initialization, and \code{"furthest"} for initialization at the furthest point from each successive centroid. See the \href{http://docs.0xdata.com/datascience/kmeans.html}{H2O K-means documentation} for more details.
+  }
+  \item{seed}{
+  (Optional) Random seed used to initialize the cluster centroids.
+  }
+  \item{dropNACols}{
+  (Optional) A logical value indicating whether to drop columns in which more than 10\% of the entries are NA.
+  }
+}
+\value{
+An object of class \code{\linkS4class{H2OKMeansModel}} with slots key, data, and model, where the last is a list of the following components:
+  \item{centers }{A matrix of cluster centers.}
+  \item{cluster }{A \code{\linkS4class{H2OParsedData}} object containing the vector of integers (from 1 to k), which indicate the cluster to which each point is allocated.}
+  \item{size }{The number of points in each cluster.}
+  \item{withinss }{Vector of within-cluster sum of squares, with one component per cluster.}
+  \item{tot.withinss }{Total within-cluster sum of squares, i.e., sum(withinss).}
+}
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+\code{\link{h2o.importFile}, \link{h2o.importFolder}, \link{h2o.importHDFS}, \link{h2o.importURL}, \link{h2o.uploadFile}}
+}
+
+\examples{
+library(h2o)
+localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
+prosPath = system.file("extdata", "prostate.csv", package = "h2o")
+prostate.hex = h2o.importFile(localH2O, path = prosPath)
+h2o.kmeans(data = prostate.hex, centers = 10, cols = c("AGE", "RACE", "VOL", "GLEASON"))
+}
diff --git a/R/h2o-package/man/h2o.naiveBayes.Rd b/R/h2o-package/man/h2o.naiveBayes.Rd
@@ -9,7 +9,7 @@ H2O: Naive Bayes Classifier
 \description{Builds a Naive Bayes classifier on a parsed data set.
 }
 \usage{
-h2o.naiveBayes(x, y, data, laplace = 0, dropNACols = FALSE)
+h2o.naiveBayes(x, y, data, key = "", laplace = 0, dropNACols = FALSE)
 }
 \arguments{
   \item{x}{
@@ -20,6 +20,9 @@ The name of the response variable in the model.
 }
   \item{data}{
 An \code{\linkS4class{H2OParsedData}} (\code{version = 2}) object containing the variables in the model.
+}
+  \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
 }
   \item{laplace}{
 (Optional) A positive number controlling Laplace smoothing. The default (0) disables Laplace smoothing.

diff --git a/R/h2o-package/man/h2o.pcr.Rd b/R/h2o-package/man/h2o.pcr.Rd
@@ -7,7 +7,7 @@
 Runs GLM regression on PCA results, and allows for transformation of test data to match PCA transformations of training data.  
 }
 \usage{
-h2o.pcr(x, y, data, ncomp, family, nfolds = 10, alpha = 0.5, lambda = 1e-05, 
+h2o.pcr(x, y, data, key = "", ncomp, family, nfolds = 10, alpha = 0.5, lambda = 1e-05, 
   epsilon = 1e-05, tweedie.p)
 }
 
@@ -21,6 +21,9 @@ h2o.pcr(x, y, data, ncomp, family, nfolds = 10, alpha = 0.5, lambda = 1e-05,
   \item{data}{
     An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
   }
+    \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
+}
   \item{ncomp}{
     A number indicating the number of principal components to use in the regression model.
   }

diff --git a/R/h2o-package/man/h2o.prcomp.Rd b/R/h2o-package/man/h2o.prcomp.Rd
@@ -8,7 +8,7 @@ Principal Components Analysis
 Performs principal components analysis on the given data set.
 }
 \usage{
-h2o.prcomp(data, tol = 0, cols = "", standardize = TRUE, retx = FALSE)
+h2o.prcomp(data, tol = 0, cols = "", key = "", standardize = TRUE, retx = FALSE)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -21,6 +21,9 @@ h2o.prcomp(data, tol = 0, cols = "", standardize = TRUE, retx = FALSE)
   \item{cols}{
   (Optional) A vector of column names or indices indicating the features to perform PCA on. By default, all columns in the dataset are analyzed.
   }
+    \item{key}{
+  (Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
+}
   \item{standardize}{
   (Optional) A logical value indicating whether the variables should be shifted to be zero centered and scaled to have unit variance before the analysis takes place.
   }

diff --git a/R/h2o-package/man/h2o.randomForest.Rd b/R/h2o-package/man/h2o.randomForest.Rd
@@ -7,10 +7,10 @@ H2O: Random Forest
 Performs random forest classification on a data set.
 }
 \usage{
-h2o.randomForest(x, y, data, classification = TRUE, ntree = 50, depth = 20, 
-  mtries = -1, sample.rate = 2/3, nbins = 100, seed = -1, importance = FALSE, 
-  nfolds = 0, validation, nodesize = 1, balance.classes = FALSE, 
-  max.after.balance.size = 5)
+h2o.randomForest(x, y, data, key = "", classification = TRUE, ntree = 50, 
+  depth = 20, mtries = -1, sample.rate = 2/3, nbins = 100, seed = -1, 
+  importance = FALSE, nfolds = 0, validation, nodesize = 1, 
+  balance.classes = FALSE, max.after.balance.size = 5)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -22,6 +22,9 @@ The name or index of the response variable. If the data does not contain a heade
 }
   \item{data}{
 An \code{\linkS4class{H2OParsedData}} object containing the variables in the model.
+}
+  \item{key}{
+(Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically be generated.
 }
   \item{classification}{
 (Optional) A logical value indicating whether a classification model should be built (as opposed to regression).