Skip to content

Commit

Permalink
Merge branch 'master' into arno-deeplearning2
Browse files Browse the repository at this point in the history
Conflicts:
	src/main/java/water/api/ConfusionMatrix.java
  • Loading branch information
arnocandel committed Feb 15, 2014
2 parents 3726958 + d34165d commit 99b8e23
Show file tree
Hide file tree
Showing 137 changed files with 3,403 additions and 187,842 deletions.
72 changes: 72 additions & 0 deletions R/examples/HUDdemo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# HUD housing data demo: load the data into H2O, clean it, explore the rent
# distribution, then run k-means, principal components regression and GLM.
library(h2o)
h2o.server <- h2o.init()

hud <- h2o.uploadFile(h2o.server, "../smalldata/hud.clean.csv")

# Poke around at the data, take a look at the variables
head(hud)
str(hud)
summary(hud)
nrow(hud)
ncol(hud)


# In the original data there were two interesting columns for dependent variables -
# ZSMHC: the total cost of living in the home including rent, utilities, insurance etc...
# and RENT, the amount of rent that the household paid. We want to pull the -6 entries from
# those columns (since -6 is the HUD code for NA). Because we may want to consider the cost of
# housing relative to income, we'll clean up the column for income as well.


# Keep rows where columns 2 and 6 are above the -6 NA code and income (ZINC) is positive.
hud.new <- hud[(hud[, 2] > -6 & hud[, 6] > -6 & hud$ZINC > 0), ]
str(hud.new)
summary(hud.new)
nrow(hud.new)

# Examining the distribution of the likely dependent variables:
quantile(hud.new$RENT)
summary(hud.new$RENT)
quantile(hud.new$ZSMHC)
summary(hud.new$ZSMHC)

# Both DVs look like they have a strong skew to the right tail - driving the mean well above
# the median value. If you look at a table for RENT - you can see that the data are pretty
# clumpy around round numbers (i.e., people are more likely to pay $700 for rent than $678,
# or $723), and then there is a weird spike at the highest value of $5892 - about $3000 more
# than the second highest value. They might be legitimate observations, but rents this high
# are well away from the rest of the distribution, so we'll separate these highest values out
# for now, and consider the rents that fall in the normal range.
as.data.frame(table(hud.new$RENT))
hud.short <- hud.new[(hud.new$RENT < 3000), ]
summary(hud.short)

# Running a quick Kmeans model allows us to further characterize: (for instance, note in the
# cluster generated below that the rents in the upper middle group also have much lower
# incidence of income from social safety nets, and lower incidence of rodents. At the highest
# rents level the incidents of all of these increase again, suggesting that higher rents are
# not necessarily an indicator of higher quality housing)
hud.epx <- as.data.frame(hud.short)
hud.epx <- as.h2o(h2o = h2o.server, hud.epx)
hud.kmeans <- h2o.kmeans(
  data = hud.epx, centers = 4,
  cols = c("RATS", "MICE", "MOLD", "POOR", "QSS", "QSSI", "QWKCMP", "QWELF", "RENT"),
  normalize = FALSE
)
hud.kmeans


# Even though we pared the data down from ~900 original columns to ~70 columns, we expect that
# some of the columns are highly collinear (for example, houses that report income from social
# safety nets are also more likely to report incomes near or below the poverty line, or houses
# that have a washer are much more likely to also have a dryer, making one condition a
# reasonable predictor of the other).

# Each predictor is listed once ("ZINC" previously appeared twice in this vector).
hud.PCAregress <- h2o.pcr(
  x = c("RATS", "MICE", "MOLD", "POOR", "QSS", "QSSI", "QWKCMP", "QWELF", "ZINC", "EVROD",
        "EROACH", "TOILET", "TUB", "REFR", "TRASH", "DISPL", "STOVE", "AIRSYS", "ELECT",
        "LIVING", "KITCH", "HALFB", "FAMRM", "DINING", "DENS", "BEDRMS", "BATHS", "COOK",
        "OVEN", "DRY", "WASH", "DISH", "PLUMB", "KITCHEN", "PHONE", "ROOMS", "APPLY",
        "VCHRMOV", "VCHER", "QRETIR", "QSELF", "ZADULT", "PER"),
  y = "RENT", data = hud.short, ncomp = 10, family = "gaussian"
)
hud.PCAregress

# We can also just run a standard regression on the data set, and use regularization to tune.
# Split data into test and train sets (column 71 holds the random split key):
hud.short[, 71] <- h2o.runif(hud.short)
summary(hud.short[, 71])
hud.short.train <- hud.short[(hud.short[, 71] <= .80), ]
hud.short.test <- hud.short[(hud.short[, 71] > .80), ]
nrow(hud.short.train)
nrow(hud.short.test)

preds <- c("REGMOR", "DIVISION", "REGION", "METRO", "STATE", "LMED", "LMEDA", "LMEDB", "FMR",
           "FMRA", "FMRB", "L30", "L50", "L80", "IPOV", "PER", "ZADULT", "ZINC", "ZINC2",
           "QSELF", "QSS", "QSSI", "QWELF", "QRETIR", "QWKCMP", "POOR", "VCHER", "VCHRMOV",
           "RENEW", "APPLY", "ROOMS", "PHONE", "KITCHEN", "PLUMB", "DISH", "WASH", "DRY",
           "OVEN", "COOK", "NUNIT2", "BATHS", "BEDRMS", "DENS", "DINING", "FAMRM", "HALFB",
           "KITCH", "LIVING", "OTHFN", "ELECT", "AIRSYS", "STOVE", "PORTH", "DISPL", "TRASH",
           "REFR", "TOILET", "TUB", "RATS", "MICE", "MOLD", "EROACH", "EVROD")
# Grid of lambda values from 0 to 1 in steps of 0.01
L <- seq(from = 0, to = 1, by = .01)
hud.reg <- h2o.glm.FV(x = preds, y = "ZSMHC", family = "gaussian", standardize = TRUE,
                      alpha = c(0, 0.001, .01, .1), lambda = L, nfolds = 0,
                      data = hud.short.train)
hud.reg
# Refit with alpha = 0 and lambda = 0, then score the held-out rows
hud.best <- h2o.glm.FV(x = preds, y = "ZSMHC", family = "gaussian", standardize = TRUE,
                       alpha = 0, lambda = 0, nfolds = 0, data = hud.short.train)
hud.test <- h2o.predict(hud.best, hud.short.test)
summary(hud.test)




4 changes: 2 additions & 2 deletions R/h2o-DESCRIPTION.template
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ Date: 2013-09-13
Author: Anqi Fu
Maintainer: Anqi Fu <[email protected]>
Description: This is a wrapper that installs the package for running H2O via its REST API from within R.
License: Apache-2
Depends: R (>= 2.13.0), RCurl, rjson, statmod, tools
License: Apache-2.0
Depends: R (>= 2.13.0), RCurl, rjson, statmod, tools, utils
NeedsCompilation: no
SystemRequirements: java
URL: http://www.0xdata.com
2 changes: 2 additions & 0 deletions R/h2o-package/NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import("RCurl", "rjson", "statmod")
importFrom("utils", "installed.packages")
exportPattern("^[[:alpha:]]+")
export("h2o.shutdown", "h2o.init")
exportClasses(
Expand Down
184 changes: 73 additions & 111 deletions R/h2o-package/R/h2oWrapper.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
setClass("H2OClient", representation(ip="character", port="numeric"), prototype(ip="127.0.0.1", port=54321))

h2o.__PAGE_RPACKAGE = "RPackage.json"
h2o.__PAGE_SHUTDOWN = "Shutdown.json"
.h2o.__PAGE_RPACKAGE = "RPackage.json"
.h2o.__PAGE_SHUTDOWN = "Shutdown.json"
.h2o.__PAGE_CLOUD = "Cloud.json"

setMethod("show", "H2OClient", function(object) {
cat("IP Address:", object@ip, "\n")
Expand All @@ -21,7 +22,6 @@ h2o.init <- function(ip = "127.0.0.1", port = 54321, startH2O = TRUE, silentUpgr
if(!is.character(Xmx)) stop("Xmx must be of class character")
if(!regexpr("^[1-9][0-9]*[gGmM]$", Xmx)) stop("Xmx option must be like 1g or 1024m")

.startedH2O <<- FALSE
myURL = paste("http://", ip, ":", port, sep="")
if(!url.exists(myURL)) {
if(!startH2O)
Expand All @@ -30,53 +30,89 @@ h2o.init <- function(ip = "127.0.0.1", port = 54321, startH2O = TRUE, silentUpgr
cat("\nH2O is not running yet, starting it now...\n")
# h2oWrapper.startLauncher()
# invisible(readline("Start H2O, then hit <Return> to continue: "))
h2o.startJar(Xmx)
.h2o.startJar(Xmx)
count = 0; while(!url.exists(myURL) && count < 60) { Sys.sleep(1); count = count + 1 }
if(!url.exists(myURL)) stop("H2O failed to start, stopping execution.")
} else stop("Can only start H2O launcher if IP address is localhost")
}
cat("Successfully connected to", myURL, "\n")
h2o.checkPackage(myURL, silentUpgrade, promptUpgrade)
.h2o.checkPackage(myURL, silentUpgrade, promptUpgrade)

if("package:h2oRClient" %in% search())
detach("package:h2oRClient", unload=TRUE)
if("h2oRClient" %in% installed.packages()[,1])
library(h2oRClient)
return(new("H2OClient", ip = ip, port = port))

H2Oserver = new("H2OClient", ip = ip, port = port)
tmp = h2o.clusterStatus(H2Oserver)
cat("Cluster status:\n"); print(tmp)
return(H2Oserver)
}

# Shuts down H2O instance running at given IP and port
h2o.shutdown <- function(object, prompt = TRUE) {
if(class(object) != "H2OClient") stop("object must be of class H2OClient")
h2o.shutdown <- function(client, prompt = TRUE) {
if(class(client) != "H2OClient") stop("client must be of class H2OClient")
if(!is.logical(prompt)) stop("prompt must be of class logical")

myURL = paste("http://", object@ip, ":", object@port, sep="")
myURL = paste("http://", client@ip, ":", client@port, sep="")
if(!url.exists(myURL)) stop(paste("There is no H2O instance running at", myURL))
if(prompt) {
ans = readline(paste("Are you sure you want to shutdown the H2O instance running at", myURL, "(Y/N)? "))
temp = substr(ans, 1, 1)
} else temp = "y"
if(temp == "Y" || temp == "y") {
res = getURLContent(paste(myURL, h2o.__PAGE_SHUTDOWN, sep="/"))
res = getURLContent(paste(myURL, .h2o.__PAGE_SHUTDOWN, sep="/"))
res = fromJSON(res)
if(!is.null(res$error))
stop(paste("Unable to shutdown H2O. Server returned the following error:\n", res$error))
}
# if(url.exists(myURL)) stop("H2O failed to shutdown.")
}

# ----------------------- Diagnostics ----------------------- #
# Print a summary of the H2O cloud behind `client` and return per-node statistics.
#
# client: an H2OClient object; its ip and port slots locate the Cloud.json page.
#
# Prints version, cloud name, node name, cloud size and lock state, then returns a
# data.frame with one row per node restricted to the columns named in `cnames`.
# The last_contact column is rewritten as the number of SECONDS elapsed since the
# node was last contacted.
#
# Stops if `client` is missing or not an H2OClient, if the URL is unreachable, or
# if the server reports no nodes.
h2o.clusterStatus <- function(client) {
  if(missing(client) || !inherits(client, "H2OClient")) stop("client must be a H2OClient object")
  myURL <- paste("http://", client@ip, ":", client@port, "/", .h2o.__PAGE_CLOUD, sep = "")
  if(!url.exists(myURL)) stop("Cannot connect to H2O instance at ", myURL)
  res <- fromJSON(postForm(myURL, style = "POST"))

  cat("Version:", res$version, "\n")
  cat("Cloud name:", res$cloud_name, "\n")
  cat("Node name:", res$node_name, "\n")
  cat("Cloud size:", res$cloud_size, "\n")
  # isTRUE() keeps a missing `locked` field from aborting with "argument is of length zero"
  if(isTRUE(res$locked)) cat("Cloud is locked\n\n") else cat("Accepting new members\n\n")
  if(is.null(res$nodes) || length(res$nodes) == 0) stop("No nodes found!")

  # Calculate how many seconds ago we last contacted cloud.
  # last_contact is treated as milliseconds since 1970-01-01. Subtracting epoch
  # seconds directly always yields seconds, whereas the numeric value of a POSIXct
  # difference depends on the auto-selected unit (secs/mins/hours/days).
  cur_time_sec <- as.numeric(Sys.time())
  for(i in seq_along(res$nodes)) {
    last_contact_sec <- as.numeric(res$nodes[[i]]$last_contact)/1e3
    res$nodes[[i]]$last_contact <- cur_time_sec - last_contact_sec
  }
  cnames <- c("name", "value_size_bytes", "free_mem_bytes", "max_mem_bytes", "free_disk_bytes", "max_disk_bytes", "num_cpus", "system_load", "rpcs", "last_contact")
  temp <- data.frame(t(sapply(res$nodes, c)))
  return(temp[,cnames])
}

#-------------------------------- Helper Methods --------------------------------#
# NB: if H2OVersion matches \.99999$ it is a development version, so pull the package version out of the file name.
# Yes, this is a hack, but it makes development versions properly prompt for an upgrade.
h2o.checkPackage <- function(myURL, silentUpgrade, promptUpgrade) {
temp = postForm(paste(myURL, h2o.__PAGE_RPACKAGE, sep="/"), style = "POST")
.h2o.checkPackage <- function(myURL, silentUpgrade, promptUpgrade) {
h2oWrapper.__formatError <- function(error, prefix=" ") {
result = ""
items = strsplit(error,"\n")[[1]];
for (i in 1:length(items))
result = paste(result, prefix, items[i], "\n", sep="")
result
}

temp = postForm(paste(myURL, .h2o.__PAGE_RPACKAGE, sep="/"), style = "POST")
res = fromJSON(temp)
if (!is.null(res$error))
stop(paste(myURL," returned the following error:\n", h2oWrapper.__formatError(res$error)))

H2OVersion = res$version
myFile = res$filename
# serverMD5 = res$md5_hash

if( grepl('\\.99999$', H2OVersion) ){
H2OVersion <- sub('\\.tar\\.gz$', '', sub('.*_', '', myFile))
Expand All @@ -93,29 +129,21 @@ h2o.checkPackage <- function(myURL, silentUpgrade, promptUpgrade) {

if("h2oRClient" %in% myPackages && !needs_upgrade )
cat("H2O R package and server version", H2OVersion, "match\n")
else if(h2o.shouldUpgrade(silentUpgrade, promptUpgrade, H2OVersion)) {
else if(.h2o.shouldUpgrade(silentUpgrade, promptUpgrade, H2OVersion)) {
if("h2oRClient" %in% myPackages) {
cat("Removing old H2O R package version", toString(packageVersion("h2oRClient")), "\n")
if("package:h2oRClient" %in% search())
detach("package:h2oRClient", unload=TRUE)
remove.packages("h2oRClient")
}
cat("Downloading and installing H2O R package version", H2OVersion, "\n")
# download.file(paste(myURL, "R", myFile, sep="/"), destfile = paste(getwd(), myFile, sep="/"), mode = "wb")
# temp = getBinaryURL(paste(myURL, "R", myFile, sep="/"))
# writeBin(temp, paste(getwd(), myFile, sep="/"))
#
# if(as.character(serverMD5) != as.character(md5sum(paste(getwd(), myFile, sep="/"))))
# warning("Mismatched MD5 hash! Check you have downloaded complete R package.")
# install.packages(paste(getwd(), myFile, sep="/"), repos = NULL, type = "source")
# file.remove(paste(getwd(), myFile, sep="/"))
install.packages("h2oRClient", repos = c(H2O = paste(myURL, "R", sep = "/"), getOption("repos")))
}
}

# Check if user wants to install H2O R package matching version on server
# Note: silentUpgrade supersedes promptUpgrade
h2o.shouldUpgrade <- function(silentUpgrade, promptUpgrade, H2OVersion) {
.h2o.shouldUpgrade <- function(silentUpgrade, promptUpgrade, H2OVersion) {
if(silentUpgrade) return(TRUE)
if(promptUpgrade) {
ans = readline(paste("Do you want to install H2O R package version", H2OVersion, "from the server (Y/N)? "))
Expand All @@ -126,35 +154,22 @@ h2o.shouldUpgrade <- function(silentUpgrade, promptUpgrade, H2OVersion) {
} else return(FALSE)
}

# Prefix every line of a (possibly multi-line) error message.
#
# error:  character value; only its first element is used and it is split on "\n".
# prefix: string prepended to each line.
#
# Returns one string in which every line of `error` is preceded by `prefix` and
# terminated by "\n"; returns "" when there is nothing to format.
h2oWrapper.__formatError <- function(error, prefix=" ") {
  if (length(error) == 0) return("")
  items <- strsplit(as.character(error), "\n")[[1]]
  # strsplit("") gives character(0); looping over 1:length(items) would then visit
  # indices 1 and 0 and emit a bogus "NA" line, so bail out explicitly.
  if (length(items) == 0) return("")
  paste(prefix, items, "\n", sep = "", collapse = "")
}

#---------------------------- H2O Jar Initialization -------------------------------#
.h2o.pkg.path <- NULL

.onLoad <- function(lib, pkg) {
.h2o.pkg.path <<- paste(lib, pkg, sep = .Platform$file.sep)

if(.Platform$OS.type == "unix") {
print("Checking libcurl version...")
curl_path <- Sys.which("curl-config")
if(system2(curl_path, args = "--version") != 0)
stop("libcurl not found! Please install libcurl (version 7.14.0 or higher) from http://curl.haxx.se. On Linux systems,
you will often have to explicitly install libcurl-devel to have the header files and the libcurl library.")
# installing RCurl requires curl and curl-config, which is typically separately installed
if(!("RCurl" %in% installed.packages()[,1])) {
if(.Platform$OS.type == "unix") {
packageStartupMessage("Checking libcurl version...")
curl_path <- Sys.which("curl-config")
if(curl_path[[1]] == '' || system2(curl_path, args = "--version") != 0)
stop("libcurl not found! Please install libcurl (version 7.14.0 or higher) from http://curl.haxx.se. On Linux systems,
you will often have to explicitly install libcurl-devel to have the header files and the libcurl library.")
}
}
# TODO: Not sure how to check for libcurl in Windows

# Install and load H2O R package dependencies
# require(tools)
# myPackages = rownames(installed.packages())
# myReqPkgs = c("bitops", "RCurl", "rjson", "statmod")
# temp = lapply(myReqPkgs, function(x) { if(!x %in% myPackages) { cat("Installing package dependency", x, "\n"); install.packages(x, repos = "http://cran.rstudio.com/") }
# if(!require(x, character.only = TRUE)) stop("The required package ", x, " is not installed. Please type install.packages(\"", x, "\") to install the dependency from CRAN.") })
}

.onAttach <- function(libname, pkgname) {
Expand Down Expand Up @@ -182,25 +197,31 @@ h2oWrapper.__formatError <- function(error, prefix=" ") {
packageStartupMessage(msg)

# TODO: Might need to be careful if .LastOriginal exists. Also, user can override .Last manually and break hack.
.startedH2O <<- FALSE
.LastOriginal <<- function() { return(NULL) }
if(exists(".Last", envir = .GlobalEnv)) {
.LastOriginal <<- get(".Last", envir = .GlobalEnv)
assign(".Last", function(..., envir = parent.frame()) {
ip = "127.0.0.1"; port = 54321
myURL = paste("http://", ip, ":", port, sep = "")

require(RCurl); require(rjson)
if(exists(".startedH2O") && .startedH2O && url.exists(myURL))
# require(RCurl); require(rjson)
if(exists(".startedH2O") && .startedH2O && url.exists(myURL)) {
h2o.shutdown(new("H2OClient", ip=ip, port=port), FALSE)
.startedH2O <<- FALSE
}
eval(.LastOriginal(...), envir = envir)
}, envir = .GlobalEnv)
} else {
assign(".Last", function() {
ip = "127.0.0.1"; port = 54321
myURL = paste("http://", ip, ":", port, sep = "")

require(RCurl); require(rjson)
if(exists(".startedH2O") && .startedH2O && url.exists(myURL))
# require(RCurl); require(rjson)
if(exists(".startedH2O") && .startedH2O && url.exists(myURL)) {
h2o.shutdown(new("H2OClient", ip=ip, port=port), FALSE)
.startedH2O <<- FALSE
}
}, envir = .GlobalEnv)
}
}
Expand All @@ -217,11 +238,11 @@ h2oWrapper.__formatError <- function(error, prefix=" ") {
# myURL = paste("http://", ip, ":", port, sep = "")
#
# require(RCurl); require(rjson)
# if(url.exists(myURL) && exists(".startedH2O") && .startedH2O)
# if(exists(".startedH2O") && .startedH2O && url.exists(myURL))
# h2o.shutdown(new("H2OClient", ip=ip, port=port), FALSE)
# }

h2o.startJar <- function(memory = "1g") {
.h2o.startJar <- function(memory = "1g") {
command <- Sys.which("java")
#
# TODO: tmp files should be user-independent
Expand Down Expand Up @@ -264,62 +285,3 @@ h2o.startJar <- function(memory = "1g") {
}
.startedH2O <<- TRUE
}

#---------------------------------- Deprecated ----------------------------------#
# Start H2O launcher GUI if installed locally from InstallBuilder executable
# Start the H2O launcher installed by the InstallBuilder executable.
#
# Looks in the per-user h2o directory (APPDATA on Windows, otherwise
# ~/Library/Application Support) for version files, takes the last one returned
# by list.files(), reads the launcher location stored inside it and runs the
# platform-specific start script found there.
#
# Stops if no version file is found, if the file holds no launcher path, or if
# the start script does not exist. Takes no arguments; called for its side effect.
h2oWrapper.startLauncher <- function() {
  myOS <- Sys.info()["sysname"]
  isWindows <- myOS == "Windows"

  if(isWindows) {
    verPath <- paste(Sys.getenv("APPDATA"), "h2o", sep="/")
  } else {
    verPath <- paste(Sys.getenv("HOME"), "Library/Application Support/h2o", sep="/")
  }
  myFiles <- list.files(verPath)
  if(length(myFiles) == 0) stop("Cannot find location of H2O launcher. Please check that your H2O installation is complete.")
  # Must trim myFiles so all have format 1.2.3.45678.txt (use regexpr)!

  # Get H2O with latest version number
  # If latest isn't working, maybe go down list to earliest until one executes?
  latestFile <- tail(myFiles, n=1)
  fileName <- paste(verPath, latestFile, sep="/")
  # Strip only a literal trailing ".txt"; strsplit(x, ".txt") treated "." as a
  # regex wildcard and could return more than one piece.
  myVersion <- sub("\\.txt$", "", latestFile)
  launchPath <- readChar(fileName, file.info(fileName)$size)
  if(is.null(launchPath) || launchPath == "")
    stop(paste("No H2O launcher matching H2O version", myVersion, "found"))

  cat("Launching H2O version", myVersion)
  if(isWindows) {
    tempPath <- paste(launchPath, "windows/h2o.bat", sep="/")
    if(!file.exists(tempPath)) stop(paste("Cannot open H2OLauncher.jar! Please check if it exists at", tempPath))
    shell.exec(tempPath)
  }
  else {
    tempPath <- paste(launchPath, "Contents/MacOS/h2o", sep="/")
    if(!file.exists(tempPath)) stop(paste("Cannot open H2OLauncher.jar! Please check if it exists at", tempPath))
    # Quote the path so spaces in it do not split the shell command
    system(paste("bash", shQuote(tempPath)))
  }
}

# Fill in the H2O start-script template shipped in the package's scripts directory.
#
# target: file to write the generated script to; when NULL the script lines are
#         returned instead of written.
# memory: value substituted for the @MEM@ placeholder.
#
# The placeholders @JAVA_HOME@, @JAVA_PROG@, @H2O_JAR@, @FLAT@ and @MEM@ are
# replaced. JAVA_HOME and JAVA_PROG start from the environment variables of the
# same name; H2O_JAR, FLAT and MEM are always overridden below.
#
# Returns the character vector of script lines when `target` is NULL, otherwise
# the result of writeLines().
h2o.__genScript <- function(target = NULL, memory = "2g") {
  template_name <- if(.Platform$OS.type == "windows") "h2o.bat.TEMPLATE" else "h2o.TEMPLATE"
  run.template <- paste(.h2o.pkg.path, "scripts", template_name, sep = .Platform$file.sep)
  rt <- readLines(run.template)

  settings <- c("JAVA_HOME", "JAVA_PROG", "H2O_JAR", "FLAT", "MEM")
  sl <- lapply(settings, Sys.getenv)
  names(sl) <- settings

  if (nchar(sl[["JAVA_PROG"]]) == 0) {
    # Prefer $JAVA_HOME/bin/java when it exists; otherwise rely on "java" from the
    # PATH. Previously JAVA_PROG stayed empty when JAVA_HOME was set but the
    # binary was missing, because the else branch belonged to the inner if.
    jc <- paste(sl[["JAVA_HOME"]], "bin", "java", sep = .Platform$file.sep)
    if (nchar(sl[["JAVA_HOME"]]) > 0 && file.exists(jc)) {
      sl[["JAVA_PROG"]] <- jc
    } else {
      sl[["JAVA_PROG"]] <- "java"
    }
  }
  sl[["H2O_JAR"]] <- system.file("java", "h2o.jar", package = "h2o")
  sl[["FLAT"]] <- system.file("java", "flatfile.txt", package = "h2o")
  sl[["MEM"]] <- memory

  # fixed = TRUE: the placeholders are literal text, and values such as Windows
  # paths should not have their backslashes interpreted as regex escapes.
  for (i in names(sl)) rt <- gsub(paste("@", i, "@", sep = ""), sl[[i]], rt, fixed = TRUE)
  if (is.null(target)) return(rt)
  writeLines(rt, target)
}
Loading

0 comments on commit 99b8e23

Please sign in to comment.