Skip to content

Commit

Permalink
Minor test tweaks to make it easier to demo offline.
Browse files: browse the repository at this point in the history
  • Loading branch information
tomkraljevic committed Mar 2, 2015
1 parent 878c2fc commit 8d6764a
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 19 deletions.
18 changes: 12 additions & 6 deletions R/tests/testdir_demos/runit_demo_VI_all_algos.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ test <- function(h) {
# Parse data into H2O
print("Parsing data into H2O")
# From an h2o git workspace.
data.hex = h2o.importFile(h, normalizePath(locate("smalldata/bank-additional-full.csv")), key="data.hex")
if (FALSE) {
h = h2o.init()
data.hex = h2o.importFile(h, "/Users/tomk/0xdata/ws/h2o/smalldata/bank-additional-full.csv", key="data.hex")
}
else {
data.hex = h2o.importFile(h, normalizePath(locate("smalldata/bank-additional-full.csv")), key="data.hex")
}
# Or directly from github.
# data.hex = h2o.importFile(h, path = "https://raw.github.com/0xdata/h2o/master/smalldata/bank-additional-full.csv", key="data.hex")

Expand All @@ -31,7 +37,7 @@ myY="y"

# Run GBM with variable importance
my.gbm <- h2o.gbm(x = myX, y = myY, distribution = "bernoulli", data = data.hex, n.trees =100,
interaction.depth = 2, shrinkage = 0.01, importance = T)
interaction.depth = 2, shrinkage = 0.01, importance = T)

# Access Variable Importance from the built model
gbm.VI = my.gbm@model$varimp
Expand Down Expand Up @@ -73,18 +79,18 @@ my.glm = h2o.glm(x=myX, y=myY, data=data.hex, family="binomial",standardize=T,us
# Select the best model picked by glm
best_model = my.glm@best_model

# Get the normalized coefficients of the best model
n_coeff = abs(my.glm@models[[best_model]]@model$normalized_coefficients)
# Get the normalized coefficients of the best model
n_coeff = abs(my.glm@models[[best_model]]@model$normalized_coefficients)

# Access Variable Importance by removing the intercept term
VI = abs(n_coeff[-length(n_coeff)])
VI = abs(n_coeff[-length(n_coeff)])

glm.VI = VI[order(VI,decreasing=T)]
print("Variable importance from GLM")
print(glm.VI)

# Plot variable importance from glm
barplot(glm.VI[1:20],las=2,main="VI from GLM")
barplot(glm.VI[1:20],las=2,main="VI from GLM")

#--------------------------------------------------
# Run deeplearning with variable importance
Expand Down
31 changes: 23 additions & 8 deletions R/tests/testdir_demos/runit_demo_tk_cm_roc.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ if (TRUE) {

if (FALSE) {
setwd("/Users/tomk/0xdata/ws/h2o/R/tests/testdir_demos")
filePath <- "/Users/tomk/0xdata/ws/h2o/smalldata/airlines/AirlinesTrain.csv.zip"
testFilePath <- "/Users/tomk/0xdata/ws/h2o/smalldata/airlines/AirlinesTest.csv.zip"
}

source('../findNSourceUtils.R')
Expand All @@ -23,8 +25,8 @@ if (TRUE) {
testFilePath <- normalizePath(locate("smalldata/airlines/AirlinesTest.csv.zip"))
} else {
stop("need to hardcode ip and port")
# myIP = "127.0.0.1"
# myPort = 54321
myIP = "127.0.0.1"
myPort = 54321

library(h2o)
PASS_BANNER <- function() { cat("\nPASS\n\n") }
Expand All @@ -48,7 +50,7 @@ myX = c("Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
myY="IsDepDelayed"

#gbm
air.gbm = h2o.gbm(x = myX, y = myY, distribution = "multinomial", data = air.train, n.trees = 10,
air.gbm = h2o.gbm(x = myX, y = myY, distribution = "multinomial", data = air.train, n.trees = 10,
interaction.depth = 3, shrinkage = 0.01, n.bins = 100, validation = air.valid, importance = T)
print(air.gbm@model)
air.gbm@model$auc
Expand All @@ -62,7 +64,7 @@ air.test=h2o.importFile(conn,testFilePath,key="air.test")

model_object=air.rf #air.glm air.rf air.dl

#predicting on test file
#predicting on test file
pred = h2o.predict(model_object,air.test)
head(pred)

Expand All @@ -80,14 +82,27 @@ plot(perf,type="roc")

PASS_BANNER()

if (FALSE) {
h = h2o.init(ip="mr-0xb1", port=60024)
BIGDATA = FALSE
if (BIGDATA) {
h = h2o.init(ip="172.16.2.190", port=60024)
df = h2o.importFile(h, "/home/tomk/airlines_all.csv")
nrow(df)
ncol(df)
head(df)

s = h2o.runif(df) # Useful when number of rows too large for R to handle
air.train = df[s <= 0.8,]
air.test = df[s > 0.8,]

myX = c("Origin", "Dest", "Distance", "UniqueCarrier", "Month", "DayofMonth", "DayOfWeek")
myY = "IsDepDelayed"
air.glm = h2o.glm(x = myX, y = myY, data = df, family = "binomial", nfolds = 10, alpha = 0.25, lambda = 0.001)
air.glm@model$confusion
air.glm = h2o.glm(x = myX, y = myY, data = air.train,
family = "binomial", nfolds = 1, alpha = 0.25, lambda = 0.001)

pred = h2o.predict(air.glm, air.test)
dim(pred)
head(pred)
perf = h2o.performance(pred$YES,air.test$IsDepDelayed)
perf
plot(perf,type="roc")
}
11 changes: 6 additions & 5 deletions R/tests/testdir_demos/runit_demo_tk_steam.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,20 @@
if (TRUE) {
# Set working directory so that the source() below works.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))

if (FALSE) {
setwd("/Users/tomk/0xdata/ws/h2o/R/tests/testdir_demos")
filePath <- "/Users/tomk/0xdata/ws/h2o/smalldata/airlines/allyears2k_headers.zip"
}

source('../findNSourceUtils.R')
options(echo=TRUE)
filePath <- normalizePath(locate("smalldata/airlines/allyears2k_headers.zip"))
} else {
stop("need to hardcode ip and port")
# myIP = "127.0.0.1"
# myPort = 54321
myIP = "127.0.0.1"
myPort = 54321

library(h2o)
PASS_BANNER <- function() { cat("\nPASS\n\n") }
filePath <- "https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip"
Expand Down

0 comments on commit 8d6764a

Please sign in to comment.