Skip to content

Commit

Permalink
Add unit tests for glm link functions; compare performance to R
Browse files Browse the repository at this point in the history
  • Loading branch information
raoariel committed Aug 14, 2014
1 parent d1eaa1a commit 538bea0
Show file tree
Hide file tree
Showing 6 changed files with 315 additions and 0 deletions.
63 changes: 63 additions & 0 deletions R/tests/testdir_algos/glm/runit_GLM_link_functions_binomial.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
##
# Comparison of H2O to R with varying link functions for the BINOMIAL family on prostate dataset
# Link functions: logit (canonical link)
# log
##

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')


test.linkFunctions <- function(conn) {

print("Read in prostate data.")
h2o.data = h2o.uploadFile(conn, locate("smalldata/prostate/prostate_complete.csv.zip"), key="h2o.data")
R.data = as.data.frame(as.matrix(h2o.data))

print("Testing for family: BINOMIAL")
print("Set variables for h2o.")
myY = "CAPSULE"
myX = c("AGE","RACE","DCAPS","PSA","VOL","DPROS","GLEASON")
print("Define formula for R")
R.formula = (R.data[,"CAPSULE"]~.)

print("Create models with canonical link: LOGIT")
model.h2o.binomial.logit <- h2o.glm(x=myX, y=myY, data=h2o.data, family="binomial", link="logit",alpha=0.5, lambda=0, nfolds=0)
model.R.binomial.logit <- glm(formula=R.formula, data=R.data[,3:9], family=binomial(link=logit), na.action=na.omit)

print("Compare model deviances for link function logit")
deviance.h2o.logit = model.h2o.binomial.logit@model$deviance / model.h2o.binomial.logit@model$null
deviance.R.logit = deviance(model.R.binomial.logit) / model.h2o.binomial.logit@model$null
difference = deviance.R.logit - deviance.h2o.logit
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.logit))
print(cat("Deviance in R: ", deviance.R.logit))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

print("|=======================================================|")
print("WARNING: Additonal log functions in tests skipped over")
print("|=======================================================|")

#Issue with unspecified start values:

# print("Create models with link: LOG")
# model.h2o.binomial.log <- h2o.glm(x=myX, y=myY, data=h2o.data, family="binomial", link="log",alpha=0.5, lambda=0, nfolds=0)
# model.R.binomial.log <- glm(formula=R.formula, data=R.data[,3:9], family=binomial(link=log), na.action=na.omit)

# print("Compare model deviances for link function log")
# deviance.h2o.log = model.h2o.binomial.log@model$deviance / model.h2o.binomial.log@model$null
# deviance.R.log = deviance(model.R.binomial.log) / model.h2o.binomial.log@model$null
# difference = deviance.R.log - deviance.h2o.log
# if (difference > 0.01) {
# print(cat("Deviance in H2O: ", deviance.h2o.log))
# print(cat("Deviance in R: ", deviance.R.log))
# checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
# }

testEnd()
}

doTest("Comparison of H2O to R with varying link functions for the BINOMIAL family", test.linkFunctions)


72 changes: 72 additions & 0 deletions R/tests/testdir_algos/glm/runit_GLM_link_functions_gamma.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
##
# Comparison of H2O to R with varying link functions for the GAMMA family on prostate dataset
# Link functions: inverse (canonical link)
# log
# identity
##

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')


test.linkFunctions <- function(conn) {

print("Read in prostate data.")
h2o.data = h2o.uploadFile(conn, locate("smalldata/prostate/prostate_complete.csv.zip"), key="h2o.data")
R.data = as.data.frame(as.matrix(h2o.data))

print("Testing for family: GAMMA")
print("Set variables for h2o.")
myY = "DPROS"
myX = c("ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","GLEASON")
print("Define formula for R")
R.formula = (R.data[,"DPROS"]~.)

print("Create models with canonical link: INVERSE")
model.h2o.gamma.inverse <- h2o.glm(x=myX, y=myY, data=h2o.data, family="gamma", link="inverse",alpha=0.5, lambda=0, nfolds=0)
model.R.gamma.inverse <- glm(formula=R.formula, data=R.data[,c(1:4,6:9)], family=Gamma(link=inverse), na.action=na.omit)

print("Compare model deviances for link function inverse")
deviance.h2o.inverse = model.h2o.gamma.inverse@model$deviance / model.h2o.gamma.inverse@model$null
deviance.R.inverse = deviance(model.R.gamma.inverse) / model.h2o.gamma.inverse@model$null
difference = deviance.R.inverse - deviance.h2o.inverse
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.inverse))
print(cat("Deviance in R: ", deviance.R.inverse))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

print("Create models with link function: LOG")
model.h2o.gamma.log <- h2o.glm(x=myX, y=myY, data=h2o.data, family="gamma", link="log",alpha=0.5, lambda=0, nfolds=0)
model.R.gamma.log <- glm(formula=R.formula, data=R.data[,c(1:4,6:9)], family=Gamma(link=log), na.action=na.omit)

print("Compare model deviances for link function log")
deviance.h2o.log = model.h2o.gamma.log@model$deviance / model.h2o.gamma.log@model$null
deviance.R.log = deviance(model.R.gamma.log) / model.h2o.gamma.log@model$null
difference = deviance.R.log - deviance.h2o.log
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.log))
print(cat("Deviance in R: ", deviance.R.log))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

print("Create models with link: IDENTITY")
model.h2o.gamma.identity <- h2o.glm(x=myX, y=myY, data=h2o.data, family="gamma", link="identity",alpha=0.5, lambda=0, nfolds=0)
model.R.gamma.identity <- glm(formula=R.formula, data=R.data[,c(1:4,6:9)], family=Gamma(link=identity), na.action=na.omit)

print("Compare model deviances for link function identity")
deviance.h2o.identity = model.h2o.gamma.identity@model$deviance / model.h2o.gamma.identity@model$null
deviance.R.identity = deviance(model.R.gamma.identity) / model.h2o.gamma.identity@model$null
difference = deviance.R.identity - deviance.h2o.identity
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.identity))
print(cat("Deviance in R: ", deviance.R.identity))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

testEnd()
}

doTest("Comparison of H2O to R with varying link functions for the GAMMA family", test.linkFunctions)


81 changes: 81 additions & 0 deletions R/tests/testdir_algos/glm/runit_GLM_link_functions_gaussian.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
##
# Comparison of H2O to R with varying link functions for the GAUSSIAN family on prostate dataset
# Link functions: identity (canonical link)
# log
# inverse
##

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')


test.linkFunctions <- function(conn) {

print("Read in prostate data.")
h2o.data = h2o.uploadFile(conn, locate("smalldata/prostate/prostate_complete.csv.zip"), key="h2o.data")
R.data = as.data.frame(as.matrix(h2o.data))

print("Testing for family: GAUSSIAN")
print("Set variables for h2o.")
myY = "GLEASON"
myX = c("ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS")
print("Define formula for R")
R.formula = (R.data[,"GLEASON"]~.)

print("Create models with canonical link: IDENTITY")
model.h2o.gaussian.identity <- h2o.glm(x=myX, y=myY, data=h2o.data, family="gaussian", link="identity",alpha=0.5, lambda=0, nfolds=0)
model.R.gaussian.identity <- glm(formula=R.formula, data=R.data[,1:8], family=gaussian(link=identity), na.action=na.omit)

print("Compare model deviances for link function identity")
deviance.h2o.identity = model.h2o.gaussian.identity@model$deviance / model.h2o.gaussian.identity@model$null
deviance.R.identity = deviance(model.R.gaussian.identity) / model.h2o.gaussian.identity@model$null
difference = deviance.R.identity - deviance.h2o.identity
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.identity))
print(cat("Deviance in R: ", deviance.R.identity))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

print("|=======================================================|")
print("WARNING: Additonal log functions in tests skipped over")
print("|=======================================================|")

#Issue with unspecified start values:

# print("Create models with link: LOG")
# model.h2o.gaussian.log <- h2o.glm(x=myX, y=myY, data=h2o.data, family="gaussian", link="log",alpha=0.5, lambda=0, nfolds=0)
# model.R.gaussian.log <- glm(formula=R.formula, data=R.data[,1:8], family=gaussian(link=log), na.action=na.omit)

# print("Compare model deviances for link function log")
# deviance.h2o.log = model.h2o.gaussian.log@model$deviance / model.h2o.gaussian.log@model$null
# deviance.R.log = deviance(model.R.gaussian.log) / model.h2o.gaussian.log@model$null
# difference = deviance.R.log - deviance.h2o.log
# if (difference > 0.01) {
# print(cat("Deviance in H2O: ", deviance.h2o.log))
# print(cat("Deviance in R: ", deviance.R.log))
# checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
# }


#Issue with non positive values:

# print("Create models with link: INVERSE")
# model.h2o.gaussian.inverse <- h2o.glm(x=myX, y=myY, data=h2o.data, family="gaussian", link="inverse",alpha=0.5, lambda=0, nfolds=0)
# model.R.gaussian.inverse <- glm(formula=R.formula, data=R.data[,1:8], family=gaussian(link=inverse), na.action=na.omit)

# print("Compare model deviances for link function inverse")
# deviance.h2o.inverse = model.h2o.gaussian.inverse@model$deviance / model.h2o.gaussian.inverse@model$null
# deviance.R.inverse = deviance(model.R.gaussian.inverse) / model.h2o.gaussian.inverse@model$null
# difference = deviance.R.inverse - deviance.h2o.inverse
# if (difference > 0.01) {
# print(cat("Deviance in H2O: ", deviance.h2o.inverse))
# print(cat("Deviance in R: ", deviance.R.inverse))
# checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
# }

testEnd()
}

doTest("Comparison of H2O to R with varying link functions for the GAUSSIAN family", test.linkFunctions)


57 changes: 57 additions & 0 deletions R/tests/testdir_algos/glm/runit_GLM_link_functions_poisson.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
##
# Comparison of H2O to R with varying link functions for the POISSON family on prostate dataset
# Link functions: log (canonical link)
# identity
##

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')


test.linkFunctions <- function(conn) {

print("Read in prostate data.")
h2o.data = h2o.uploadFile(conn, locate("smalldata/prostate/prostate_complete.csv.zip"), key="h2o.data")
R.data = as.data.frame(as.matrix(h2o.data))

print("Testing for family: POISSON")
print("Set variables for h2o.")
myY = "GLEASON"
myX = c("ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS")
print("Define formula for R")
R.formula = (R.data[,"GLEASON"]~.)

print("Create models with canonical link: LOG")
model.h2o.poisson.log <- h2o.glm(x=myX, y=myY, data=h2o.data, family="poisson", link="log",alpha=0.5, lambda=0, nfolds=0)
model.R.poisson.log <- glm(formula=R.formula, data=R.data[,1:8], family=poisson(link=log), na.action=na.omit)

print("Compare model deviances for link function log")
deviance.h2o.log = model.h2o.poisson.log@model$deviance / model.h2o.poisson.log@model$null
deviance.R.log = deviance(model.R.poisson.log) / model.h2o.poisson.log@model$null
difference = deviance.R.log - deviance.h2o.log
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.log))
print(cat("Deviance in R: ", deviance.R.log))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

print("Create models with link: IDENTITY")
model.h2o.poisson.identity <- h2o.glm(x=myX, y=myY, data=h2o.data, family="poisson", link="identity",alpha=0.5, lambda=0, nfolds=0)
model.R.poisson.identity <- glm(formula=R.formula, data=R.data[,1:8], family=poisson(link=identity), na.action=na.omit)

print("Compare model deviances for link function identity")
deviance.h2o.identity = model.h2o.poisson.identity@model$deviance / model.h2o.poisson.identity@model$null
deviance.R.identity = deviance(model.R.poisson.identity) / model.h2o.poisson.identity@model$null
difference = deviance.R.identity - deviance.h2o.identity
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.identity))
print(cat("Deviance in R: ", deviance.R.identity))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

testEnd()
}

doTest("Comparison of H2O to R with varying link functions for the POISSON family", test.linkFunctions)


42 changes: 42 additions & 0 deletions R/tests/testdir_algos/glm/runit_GLM_link_functions_tweedie.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
##
# Comparison of H2O to R with varying link functions for the TWEEDIE family on prostate dataset
# Link functions: tweedie (canonical link)
##

setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')


test.linkFunctions <- function(conn) {

print("Read in prostate data.")
h2o.data = h2o.uploadFile(conn, locate("smalldata/prostate/prostate_complete.csv.zip"), key="h2o.data")
R.data = as.data.frame(as.matrix(h2o.data))

print("Testing for family: TWEEDIE")
print("Set variables for h2o.")
myY = "CAPSULE"
myX = c("AGE","RACE","DCAPS","PSA","VOL","DPROS","GLEASON")
print("Define formula for R")
R.formula = (R.data[,"CAPSULE"]~.)

print("Create models with canonical link: TWEEDIE")
model.h2o.tweedie.tweedie <- h2o.glm(x=myX, y=myY, data=h2o.data, family="tweedie", link="tweedie",alpha=0.5, lambda=0, nfolds=0)
model.R.tweedie.tweedie <- glm(formula=R.formula, data=R.data[,3:9], family=tweedie, na.action=na.omit)

print("Compare model deviances for link function tweedie")
deviance.h2o.tweedie = model.h2o.tweedie.tweedie@model$deviance / model.h2o.tweedie.tweedie@model$null
deviance.R.tweedie = deviance(model.R.tweedie.tweedie) / model.h2o.tweedie.tweedie@model$null
difference = deviance.R.tweedie - deviance.h2o.tweedie
if (difference > 0.01) {
print(cat("Deviance in H2O: ", deviance.h2o.tweedie))
print(cat("Deviance in R: ", deviance.R.tweedie))
checkTrue(difference <= 0.01, "h2o's model's residualDeviance/nullDeviance is more than 0.01 lower than R's model's")
}

testEnd()
}

doTest("Comparison of H2O to R with varying link functions for the TWEEDIE family", test.linkFunctions)


Binary file added smalldata/prostate/prostate_complete.csv.zip
Binary file not shown.

0 comments on commit 538bea0

Please sign in to comment.