Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
dearirenelang committed Feb 18, 2014
2 parents c48c05f + bb3084e commit d961e4e
Show file tree
Hide file tree
Showing 75 changed files with 1,339 additions and 1,099 deletions.
4 changes: 2 additions & 2 deletions R/h2o-package/R/h2oWrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ h2o.init <- function(ip = "127.0.0.1", port = 54321, startH2O = TRUE, silentUpgr
library(h2oRClient)

H2Oserver = new("H2OClient", ip = ip, port = port)
tmp = h2o.clusterStatus(H2Oserver)
cat("Cluster status:\n"); print(tmp)
h2o.clusterInfo(H2Oserver)
cat("\n")
return(H2Oserver)
}

Expand Down
11 changes: 1 addition & 10 deletions R/h2o-package/man/h2o.init.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,4 @@ localH2O = h2o.init(ip = "localhost", port = 54321, silentUpgrade = FALSE, promp
# Automatically install H2O R package from server if version mismatch
localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = FALSE, silentUpgrade = TRUE, promptUpgrade = FALSE)
# Uninstall and maunually upgrade H2O for R (after unloading)
detach("package:h2oRClient", unload=TRUE)
detach("package:h2o", unload=TRUE)
remove.packages("h2oRClient")
remove.packages("h2o")
install.packages("~/Work/h2o/target/R/h2o_2.1.0.99999.tar.gz", repos = NULL, type = "source")
library(h2o)
}
}
26 changes: 19 additions & 7 deletions R/h2oRClient-package/R/Algorithms.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Model-building operations and algorithms
# ----------------------- Generalized Boosting Machines (GBM) ----------------------- #
# TODO: don't support missing x; default to everything?
h2o.gbm <- function(x, y, distribution='multinomial', data, n.trees=10, interaction.depth=5, n.minobsinnode=10, shrinkage=0.02, n.bins=100, validation) {
h2o.gbm <- function(x, y, distribution='multinomial', data, n.trees=10, interaction.depth=5, n.minobsinnode=10, shrinkage=0.1, n.bins=100, validation) {
args <- .verify_dataxy(data, x, y)

if(!is.numeric(n.trees)) stop('n.trees must be numeric')
Expand Down Expand Up @@ -460,6 +460,7 @@ h2o.kmeans.FV <- function(data, centers, cols='', iter.max=10, normalize = FALSE
if( missing(data) ) stop('Must specify data')
# if(class(data) != 'H2OParsedData' ) stop('data must be an h2o dataset')
if(!class(data) %in% c("H2OParsedData", "H2OParsedDataVA")) stop("data must be an H2O parsed dataset")
if(h2o.anyFactor(data)) stop("Unimplemented: K-means can only model on numeric data")

if( missing(centers) ) stop('must specify centers')
if(!is.numeric(centers) && !is.integer(centers)) stop('centers must be a positive integer')
Expand Down Expand Up @@ -621,15 +622,27 @@ h2o.nn <- function(x, y, data, classification=T, activation='Tanh', layers=500,
}

# ----------------------- Principal Components Analysis ----------------------------- #
h2o.prcomp <- function(data, tol=0, ignored_cols = '', standardize=TRUE, retx=FALSE) {
if( missing(data) ) stop('Must specify data')
h2o.prcomp <- function(data, tol=0, ignored_cols = "", standardize=TRUE, retx=FALSE) {
if(missing(data)) stop('Must specify data')
# if(class(data) != "H2OParsedData") stop('data must be an H2O FluidVec dataset')
if(!class(data) %in% c("H2OParsedData", "H2OParsedDataVA")) stop("data must be an H2O parsed dataset")
if(!is.numeric(tol)) stop('tol must be numeric')
if(!is.character(ignored_cols) && !is.numeric(ignored_cols))
stop("ignored_cols must be either a character or numeric vector")
if(!is.logical(standardize)) stop('standardize must be TRUE or FALSE')
if(!is.logical(retx)) stop('retx must be TRUE or FALSE')

destKey = .h2o.__uniqID("PCAModel")
cc <- colnames(data)
if(is.character(ignored_cols)) {
if(ignored_cols[1] != "" && any(!(ignored_cols %in% cc)))
stop(paste(paste(ignored_cols[!(ignored_cols %in% cc)], collapse=','), 'is not a valid column name'))
} else {
if(any(ignored_cols < 1 | ignored_cols > length(cc)))
stop(paste('Out of range explanatory variable', paste(ignored_cols[ignored_cols < 1 | ignored_cols > length(cc)], collapse=',')))
ignored_cols <- cc[ignored_cols]
}

res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_PCA, source=data@key, destination_key=destKey, ignored_cols = ignored_cols, tolerance=tol, standardize=as.numeric(standardize))
.h2o.__waitOnJob(data@h2o, res$job_key)
# while(!.h2o.__isDone(data@h2o, "PCA", res)) { Sys.sleep(1) }
Expand All @@ -650,8 +663,7 @@ h2o.prcomp <- function(data, tol=0, ignored_cols = '', standardize=TRUE, retx=FA
new("H2OPCAModel", key=destKey, data=data, model=result)
}

# setGeneric("h2o.pcr", function(x, y, data, ncomp, family, nfolds = 10, alpha = 0.5, lambda = 1.0e-5, tweedie.p = ifelse(family=="tweedie", 0, NA)) { standardGeneric("h2o.pcr") })
h2o.pcr <- function(x, y, data, ncomp, family, nfolds = 10, alpha = 0.5, lambda = 1.0e-5, epsilon = 1.0e-5, standardize = TRUE, tweedie.p = ifelse(family=="tweedie", 0, as.numeric(NA))) {
h2o.pcr <- function(x, y, data, ncomp, family, nfolds = 10, alpha = 0.5, lambda = 1.0e-5, epsilon = 1.0e-5, tweedie.p = ifelse(family=="tweedie", 0, as.numeric(NA))) {
args <- .verify_dataxy(data, x, y)

if( !is.numeric(nfolds) ) stop('nfolds must be numeric')
Expand All @@ -672,7 +684,7 @@ h2o.pcr <- function(x, y, data, ncomp, family, nfolds = 10, alpha = 0.5, lambda

myScore[,ncomp+1] = data[,args$y_i] # Bind response to frame of principal components
myGLMData = new("H2OParsedData", h2o=data@h2o, key=myScore@key)
h2o.glm.FV(1:ncomp, ncomp+1, myGLMData, family, nfolds, alpha, lambda, epsilon, standardize, tweedie.p)
h2o.glm.FV(1:ncomp, ncomp+1, myGLMData, family, nfolds, alpha, lambda, epsilon, standardize = FALSE, tweedie.p)
}

.h2o.prcomp.internal <- function(data, x_ignore, dest, max_pc=10000, tol=0, standardize=TRUE) {
Expand Down Expand Up @@ -916,7 +928,7 @@ h2o.confusionMatrix <- function(data, reference) {
y_i <- y
y <- cc[ y ]
}
if( y %in% x ) stop(y, 'is both an explanatory and dependent variable')
if( y %in% x ) stop(paste(y, 'is both an explanatory and dependent variable'))

x_ignore <- setdiff(setdiff( cc, x ), y)
if( length(x_ignore) == 0 ) x_ignore <- ''
Expand Down
23 changes: 13 additions & 10 deletions R/h2oRClient-package/R/Classes.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
MAX_INSPECT_VIEW = 10000
.MAX_INSPECT_VIEW = 10000

# Class definitions
# WARNING: Do NOT touch the env slot! It is used to link garbage collection between R and H2O
Expand Down Expand Up @@ -551,7 +551,8 @@ h2o.runif <- function(x, min = 0, max = 1) {
return(new("H2OParsedData", h2o=x@h2o, key=res$dest_key, logic=FALSE))
}

setMethod("colnames", "H2OParsedData", function(x) {
setMethod("colnames", "H2OParsedData", function(x, do.NULL = TRUE, prefix = "col") {
if(!do.NULL) stop("Unimplemented: Auto-generated colnames are C1, C2, ...")
res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT2, src_key=x@key)
unlist(lapply(res$cols, function(y) y$name))
})
Expand Down Expand Up @@ -652,12 +653,14 @@ setMethod("range", "H2OParsedData", function(x) {
c(min(temp[1,]), max(temp[2,]))
})

setMethod("colMeans", "H2OParsedData", function(x) {
res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT2, src_key=x@key)
temp = sapply(res$cols, function(x) { x$mean })
names(temp) = sapply(res$cols, function(x) { x$name })
temp
})
# setMethod("colMeans", "H2OParsedData", function(x, na.rm = FALSE, dims = 1) {
# if(dims != 1) stop("Unimplemented")
# if(!na.rm && .h2o.__unop2("any.na", x)) return(NA)
# res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT2, src_key=x@key)
# temp = sapply(res$cols, function(x) { x$mean })
# names(temp) = sapply(res$cols, function(x) { x$name })
# temp
# })

mean.H2OParsedData <- function(x, trim = 0, na.rm = FALSE, ...) {
if(length(x) != 1 || trim != 0) stop("Unimplemented")
Expand Down Expand Up @@ -1043,7 +1046,7 @@ setMethod("head", "H2OParsedDataVA", function(x, n = 6L, ...) {
stopifnot(length(n) == 1L)
n <- ifelse(n < 0L, max(numRows + n, 0L), min(n, numRows))
if(n == 0) return(data.frame())
if(n > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))
if(n > .MAX_INSPECT_VIEW) stop(paste("Cannot view more than", .MAX_INSPECT_VIEW, "rows"))

res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT, key=x@key, offset=0, view=n)
blanks = sapply(res$cols, function(y) { nchar(y$name) == 0 }) # Must stop R from auto-renaming cols with no name
Expand All @@ -1064,7 +1067,7 @@ setMethod("tail", "H2OParsedDataVA", function(x, n = 6L, ...) {
nrx <- nrow(x)
n <- ifelse(n < 0L, max(nrx + n, 0L), min(n, nrx))
if(n == 0) return(data.frame())
if(n > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))
if(n > .MAX_INSPECT_VIEW) stop(paste("Cannot view more than", .MAX_INSPECT_VIEW, "rows"))

idx = seq.int(to = nrx, length.out = n)
res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT, key=x@key, offset=idx[1], view=length(idx))
Expand Down
38 changes: 19 additions & 19 deletions R/h2oRClient-package/R/Internal.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Hack to get around Exec.json always dumping to same Result.hex key
# TODO: Need better way to manage temporary/intermediate values in calculations! Right now, overwriting occurs silently
pkg.env = new.env()
pkg.env$result_count = 0
pkg.env$temp_count = 0
pkg.env$IS_LOGGING = FALSE
.pkg.env = new.env()
.pkg.env$result_count = 0
.pkg.env$temp_count = 0
.pkg.env$IS_LOGGING = FALSE
.TEMP_KEY = "Last.value"
.RESULT_MAX = 200
.LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&", "|", "&&", "||", "!", "is.na")
Expand All @@ -13,23 +13,23 @@ pkg.env$IS_LOGGING = FALSE
if(.Platform$OS.type == "windows")
.myPath = paste(Sys.getenv("APPDATA"), "h2o", sep=.Platform$file.sep)

pkg.env$h2o.__LOG_COMMAND = paste(.myPath, "h2o_commands.log", sep=.Platform$file.sep)
pkg.env$h2o.__LOG_ERROR = paste(.myPath, "h2o_error_json.log", sep=.Platform$file.sep)
.pkg.env$h2o.__LOG_COMMAND = paste(.myPath, "h2o_commands.log", sep=.Platform$file.sep)
.pkg.env$h2o.__LOG_ERROR = paste(.myPath, "h2o_error_json.log", sep=.Platform$file.sep)

h2o.__startLogging <- function() { assign("IS_LOGGING", TRUE, envir = pkg.env) }
h2o.__stopLogging <- function() { assign("IS_LOGGING", FALSE, envir = pkg.env) }
h2o.__clearLogs <- function() { unlink(pkg.env$.h2o.__LOG_COMMAND)
unlink(pkg.env$.h2o.__LOG_ERROR) }
h2o.__startLogging <- function() { assign("IS_LOGGING", TRUE, envir = .pkg.env) }
h2o.__stopLogging <- function() { assign("IS_LOGGING", FALSE, envir = .pkg.env) }
h2o.__clearLogs <- function() { unlink(.pkg.env$.h2o.__LOG_COMMAND)
unlink(.pkg.env$.h2o.__LOG_ERROR) }
h2o.__getLog <- function(type) {
if(missing(type) || !type %in% c("Command", "Error"))
stop("type must be either 'Command' or 'Error'")
switch(type, Command = pkg.env$h2o.__LOG_COMMAND, Error = pkg.env$h2o.__LOG_ERROR)
switch(type, Command = .pkg.env$h2o.__LOG_COMMAND, Error = .pkg.env$h2o.__LOG_ERROR)
}

h2o.__openLog <- function(type) {
if(missing(type) || !type %in% c("Command", "Error"))
stop("type must be either 'Command' or 'Error'")
myFile = switch(type, Command = pkg.env$h2o.__LOG_COMMAND, Error = pkg.env$h2o.__LOG_ERROR)
myFile = switch(type, Command = .pkg.env$h2o.__LOG_COMMAND, Error = .pkg.env$h2o.__LOG_ERROR)

myOS = Sys.info()["sysname"]
if(myOS == "Windows") shell.exec(paste("open '", myFile, "'", sep=""))
Expand All @@ -42,7 +42,7 @@ h2o.__changeLog <- function(path, type) {
myVar = switch(type, Command = "h2o.__LOG_COMMAND", Error = "h2o.__LOG_ERROR")
myFile = switch(type, Command = "commands.log", Error = "errors.log")
cmd <- paste(path, myFile, sep = .Platform$file.sep)
assign(myVar, cmd, envir = pkg.env)
assign(myVar, cmd, envir = .pkg.env)
}

.h2o.__logIt <- function(m, tmp, commandOrErr, isPost = TRUE) {
Expand All @@ -64,7 +64,7 @@ h2o.__changeLog <- function(path, type) {
if(commandOrErr == "Command")
h <- paste(h, ifelse(isPost, "POST", "GET"), sep = "\n")
s <- paste(h, "\n", s)
write(s, file = ifelse(commandOrErr == "Command", pkg.env$h2o.__LOG_COMMAND, pkg.env$h2o.__LOG_ERROR), append = TRUE)
write(s, file = ifelse(commandOrErr == "Command", .pkg.env$h2o.__LOG_COMMAND, .pkg.env$h2o.__LOG_ERROR), append = TRUE)
}

# Internal functions & declarations
Expand Down Expand Up @@ -144,7 +144,7 @@ h2o.__changeLog <- function(path, type) {
# Re-enable POST since we found the bug in NanoHTTPD which was causing POST
# payloads to be dropped.
#
if(pkg.env$IS_LOGGING) {
if(.pkg.env$IS_LOGGING) {
# Log list of parameters sent to H2O
.h2o.__logIt(myURL, list(...), "Command")

Expand All @@ -157,7 +157,7 @@ h2o.__changeLog <- function(path, type) {
hh <- hg$value()
s <- paste(hh["Date"], "\nHTTP status code: ", hh["status"], "\n ", temp, sep = "")
s <- paste(s, "\n\n------------------------------------------------------------------\n")
write(s, file = pkg.env$h2o.__LOG_COMMAND, append = TRUE)
write(s, file = .pkg.env$h2o.__LOG_COMMAND, append = TRUE)
} else
temp = postForm(myURL, style = "POST", ...)

Expand All @@ -177,7 +177,7 @@ h2o.__changeLog <- function(path, type) {
res = fromJSON(after)

if (!is.null(res$error)) {
if(pkg.env$IS_LOGGING) .h2o.__writeToFile(res, pkg.env$h2o.__LOG_ERROR)
if(.pkg.env$IS_LOGGING) .h2o.__writeToFile(res, .pkg.env$h2o.__LOG_ERROR)
stop(paste(myURL," returned the following error:\n", .h2o.__formatError(res$error)))
}
res
Expand Down Expand Up @@ -318,8 +318,8 @@ h2o.__changeLog <- function(path, type) {

#------------------------------------ Exec2 ------------------------------------#
.h2o.__exec2 <- function(client, expr) {
destKey = paste(.TEMP_KEY, ".", pkg.env$temp_count, sep="")
pkg.env$temp_count = (pkg.env$temp_count + 1) %% .RESULT_MAX
destKey = paste(.TEMP_KEY, ".", .pkg.env$temp_count, sep="")
.pkg.env$temp_count = (.pkg.env$temp_count + 1) %% .RESULT_MAX
.h2o.__exec2_dest_key(client, expr, destKey)
# .h2o.__exec2_dest_key(client, expr, .TEMP_KEY)
}
Expand Down
54 changes: 43 additions & 11 deletions R/h2oRClient-package/R/ParseImport.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,42 @@
# Unique methods to H2O
# H2O client management operations

.readableTime<-function(epochTimeMillis) {
days=epochTimeMillis/(24*60*60*1000)
hours=(days-trunc(days))*24
minutes=(hours-trunc(hours))*60
seconds=(minutes-trunc(minutes))*60
milliseconds=(seconds-trunc(seconds))*1000
durationVector=trunc(c(days,hours,minutes,seconds,milliseconds))
names(durationVector)=c("days","hours","minutes","seconds","milliseconds")
if(length(durationVector[durationVector>0])>1)
{showVec<-durationVector[durationVector>0][1:2]} else {showVec<-durationVector[durationVector>0]}
x1=as.numeric(showVec)
x2=names(showVec)
return(paste(x1,x2))
}

h2o.clusterInfo <- function(client) {
if(missing(client) || class(client) != "H2OClient") stop("client must be a H2OClient object")
myURL = paste("http://", client@ip, ":", client@port, "/", .h2o.__PAGE_CLOUD, sep = "")
if(!url.exists(myURL)) stop("Cannot connect to H2O instance at ", myURL)
res = fromJSON(postForm(myURL, style = "POST"))

nodeInfo = res$nodes
maxMem = sum(sapply(nodeInfo,function(x) as.numeric(x['max_mem_bytes']))) / (1024 * 1024 * 1024)
numCPU = sum(sapply(nodeInfo,function(x) as.numeric(x['num_cpus'])))
clusterHealth = all(sapply(nodeInfo,function(x) as.logical(x['num_cpus']))==TRUE)

cat("R is connected to H2O cluster:\n")
cat(" H2O cluster uptime: ", .readableTime(as.numeric(res$cloud_uptime_millis)), "\n")
cat(" H2O cluster version: ", res$version, "\n")
cat(" H2O cluster name: ", res$cloud_name, "\n")
cat(" H2O cluster total nodes: ", res$cloud_size, "\n")
cat(" H2O cluster total memory: ", sprintf("%.2f GB", maxMem), "\n")
cat(" H2O cluster total cores: ", numCPU, "\n")
cat(" H2O cluster healthy: ", clusterHealth, "\n")
}

h2o.startLauncher <- function() {
myOS = Sys.info()["sysname"]

Expand Down Expand Up @@ -163,16 +200,11 @@ h2o.importFile <- function(object, path, key = "", parse = TRUE, header, sep = "
}

h2o.importFile.VA <- function(object, path, key = "", parse = TRUE, header, sep = "", col.names) {
if(class(object) != "H2OClient") stop("object must be of class H2OClient")
if(!is.character(path)) stop("path must be of class character")
if(nchar(path) == 0) stop("path must be a non-empty string")
if(!is.character(key)) stop("key must be of class character")
if(!is.logical(parse)) stop("parse must be of class logical")

if(missing(key) || nchar(key) == 0)
h2o.importFolder.VA(object, path, pattern = "", key = "", parse, header, sep, col.names = col.names)
else
h2o.importURL.VA(object, paste("file:///", path, sep=""), key, parse, header, sep, col.names = col.names)
h2o.importFolder.VA(object, path, pattern = "", key, parse, header, sep, col.names)
# if(missing(key) || nchar(key) == 0)
# h2o.importFolder.VA(object, path, pattern = "", key = "", parse, header, sep, col.names = col.names)
# else
# h2o.importURL.VA(object, paste("file:///", path, sep=""), key, parse, header, sep, col.names = col.names)
}

h2o.importFile.FV <- function(object, path, key = "", parse = TRUE, header, sep = "", col.names) {
Expand Down Expand Up @@ -459,4 +491,4 @@ h2o.downloadAllLogs <- function(client, dir_name = ".", file_name = NULL) {
cat("Writing H2O logs to", myPath, "\n")
# download.file(url, destfile = myPath)
writeBin(tempfile, myPath)
}
}
Empty file.
Loading

0 comments on commit d961e4e

Please sign in to comment.