Skip to content

Commit

Permalink
Merge commit '7eb7023673a8547e6bed64af808a4f4b72a563de' into brandon-…
Browse files Browse the repository at this point in the history
…fixes

* commit '7eb7023673a8547e6bed64af808a4f4b72a563de':
  push a beta_constraints test for tomas
  add beta_constraints to smalldata
  fix beta_constraints name
  h2o.saveAll edited to handle duplicate/xval models
  Updated PDF
  for discussion in https://0xdata.atlassian.net/browse/HEX-1814 gets a write lock exception. Can look at the test and sandbox/commands.log and decide if  you think the h2o architecture is doing what it's designed to do, or if this is a temporal state that might go away in time, or the test is not-supported user behavior. depends on your point of view, I think?
  add Tableau+R demo test
  Update Tableau demo notebook for h2o.ddply instead of ddply.
  clean up of commented out code
  R Vignette edited for structure and R code
  save model check for null xvalmodels in GLM first, fixes null pointer exception
  • Loading branch information
bghill committed Dec 5, 2014
2 parents bac566f + 7eb7023 commit e2d92d7
Show file tree
Hide file tree
Showing 10 changed files with 221 additions and 67 deletions.
2 changes: 1 addition & 1 deletion R/h2o-package/R/Algorithms.R
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ h2o.glm <- function(x, y, data, key = "",
else if (family == "tweedie")
params <- c(params, list(tweedie_variance_power = tweedie.p))

if (!is.null(beta_constraints)) params <- c(params, list(beta_constraints@key))
if (!is.null(beta_constraints)) params <- c(params, list(beta_constraints = beta_constraints@key))

res <- do.call(.h2o.__remoteSend, params)
.h2o.__waitOnJob(data@h2o, res$job_key)
Expand Down
4 changes: 3 additions & 1 deletion R/h2o-package/R/ParseImport.R
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,8 @@ h2o.saveModel <- function(object, dir="", name="",save_cv=TRUE, force=FALSE) {
h2o.saveAll <- function(object, dir="", save_cv=TRUE, force=FALSE) {
if(missing(object)) stop('Must specify object')
if(class(object) != 'H2OClient') stop('object must be of class H2OClient')
if(!is.logical(save_cv)) stop('save_cv needs to be a boolean')
if(!is.logical(force)) stop('force needs to be a boolean')

## Grab all the model keys in H2O
res = .h2o.__remoteSend(client = object, page = .h2o.__PAGE_ALLMODELS)
Expand All @@ -483,7 +485,7 @@ h2o.saveAll <- function(object, dir="", save_cv=TRUE, force=FALSE) {
for(key in keys) { dups = grep(pattern = paste(key, "_", sep = ""), x = keys)
duplicates = append(x = duplicates, values = dups)
}
keys = keys[-duplicates]
keys = if(length(duplicates) > 0) keys[-duplicates] else keys

## Create H2OModel objects in R (To grab the cross validation models)
models = lapply(keys, function(model_key) h2o.getModel(h2o = object, key = model_key))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

# Smoke test for the `beta_constraints` argument of h2o.glm.
#
# Imports the prostate data set and a beta-constraints frame over HTTP, then
# fits a binomial GLM with the constraints frame attached. There are no
# explicit assertions: the test passes when every call completes without
# raising an error.
#
# conn: the H2O connection handed in by doTest().
# Log.info() and testEnd() are presumably provided by findNSourceUtils.R.
test.GLM.prostate <- function(conn) {
  prostate_url <- "https://raw.github.com/0xdata/h2o/master/smalldata/logreg/prostate.csv"
  constraints_url <- "https://raw.github.com/0xdata/h2o/master/smalldata/beta_constraints.csv"

  Log.info("Importing prostate.csv data...\n")
  prostate_frame <- h2o.importFile(conn, prostate_url, "prostate.hex")
  constraints_frame <- h2o.importFile(conn, constraints_url)

  # Predictors are columns 3..9, the response is column 2. The fitted model is
  # not inspected; only successful completion matters here.
  constrained_fit <- h2o.glm(
    x = 3:9,
    y = 2,
    data = prostate_frame,
    family = "binomial",
    beta_constraints = constraints_frame
  )

  testEnd()
}

doTest("GLM Test: Prostate", test.GLM.prostate)

68 changes: 68 additions & 0 deletions R/tests/testdir_demos/runit_demo_tableau.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
##
# Test out the Tableau + R demo script
# It imports the airlines dataset and groups it by month and origin with h2o.ddply
# Then, it runs h2o.glm and extracts the Origin coefficients the way the Tableau worksheet does
##

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# Run the R snippets used by the Tableau demo worksheet against an H2O cluster.
#
# Steps:
#   1. Import the airlines sample data.
#   2. Group flights / cancellations by Month and by Origin with h2o.ddply and
#      pull each result into R with as.data.frame.
#   3. Fit a binomial GLM predicting `Cancelled` from Origin, Dest and
#      UniqueCarrier.
#   4. For every Origin level, look up its GLM coefficient with the same
#      helper the Tableau worksheet uses.
#
# There are no explicit assertions: the test passes when every step completes
# without raising an error.
#
# conn: the H2O connection handed in by doTest().
test.tableau <- function(conn) {
  Log.info('Check cluster status')
  h2o.clusterInfo(conn)

  Log.info('Importing data into H2O')
  data.hex <- h2o.importFile(conn, normalizePath(locate('smalldata/airlines/allyears2k_headers.zip')))

  # The converted *.R data frames below are not inspected further; the
  # conversions are kept because they are part of the demo being exercised.
  Log.info('Grouping flights by months...')
  numFlights <- h2o.ddply(data.hex, 'Month', nrow)
  numFlights.R <- as.data.frame(numFlights)

  Log.info('Grouping number of cancellations by months...')
  # NOTE(review): the name `fun2` and its one-line source are kept exactly as
  # in the demo, in case h2o.addFunction / h2o.ddply capture the function by
  # name or source text -- confirm before renaming or reformatting.
  fun2 <- function(df) {sum(df$Cancelled)}
  h2o.addFunction(conn, fun2)
  cancelledFlights <- h2o.ddply(data.hex, 'Month', fun2)
  cancelledFlights.R <- as.data.frame(cancelledFlights)

  Log.info('Grouping flights by airport...')
  originFlights <- h2o.ddply(data.hex, 'Origin', nrow)
  originFlights.R <- as.data.frame(originFlights)

  Log.info('Grouping number of cancellations by airport...')
  origin_cancelled <- h2o.ddply(data.hex, 'Origin', fun2)
  origin_cancelled.R <- as.data.frame(origin_cancelled)

  # `.arg1` / `.arg2` keep the dotted names of the original worksheet script;
  # `.arg2` is the comma-separated list of predictor columns.
  .arg2 <- 'Origin,Dest,UniqueCarrier'
  xvars <- unlist(strsplit(.arg2, split = ',', fixed = TRUE))
  data.glm <- h2o.glm(x = xvars, y = 'Cancelled', data = data.hex, family = 'binomial', nfolds = 0, standardize = TRUE)

  # Direct reference instead of eval(parse(text = 'data.glm')): same object,
  # without evaluating a string as code.
  glmModelTemp <- data.glm
  originFactors <- levels(data.hex$Origin)

  ## Tableau grab coefficients corresponding to predictor variable
  .arg1 <- originFactors

  # Return the GLM coefficient for one level of a categorical predictor.
  #
  # modelKey:          fitted model object (S4) with coefficients in
  #                    modelKey@model$coefficients.
  # variableStr:       column name used as a pattern to select coefficients.
  # predictorVariable: level whose coefficient is wanted.
  #
  # Levels listed in `.arg1` (taken from the enclosing function) that have no
  # coefficient in the model are reported as 0. Argument checks only print a
  # message; they do not stop execution.
  tableau_catFormat <- function(modelKey, variableStr, predictorVariable) {
    if (typeof(modelKey) != 'S4') print('Model Key is not in expected format of S4')
    if (!is.character(variableStr)) print('Input column is not in expected format of string')
    if (!is.character(predictorVariable)) print('Input variables is not in expected format of string')

    modelCoeff <- modelKey@model$coefficients
    idx <- grep(variableStr, names(modelCoeff))
    modelCoeff2 <- modelCoeff[idx]

    # Assumes each selected coefficient name has the form "<column>.<level>"
    # with exactly one dot, so the split pieces pair up as (column, level).
    variableNames <- unlist(strsplit(names(modelCoeff2), split = '.', fixed = TRUE))
    variableNamesMatrix <- matrix(variableNames, ncol = 2, byrow = TRUE)
    variableList <- variableNamesMatrix[, 2]
    names(modelCoeff2) <- variableList

    # Pad with zeros for levels that are absent from the model.
    setDiff <- setdiff(.arg1, variableList)
    nullVec <- rep(0, length(setDiff))
    names(nullVec) <- setDiff
    newCoefficientList <- c(modelCoeff2, nullVec)

    newCoefficientList[predictorVariable]
  }

  Log.info('Finish setting up for Tableau function')
  sapply(originFactors, function(factor) tableau_catFormat(glmModelTemp, 'Origin', factor))

  testEnd()
}

doTest("Test out the script used in tableau worksheet", test.tableau)
Loading

0 comments on commit e2d92d7

Please sign in to comment.