Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
mmalohlava committed Feb 17, 2014
2 parents 3a917ed + a9866de commit 343da08
Show file tree
Hide file tree
Showing 19 changed files with 150,212 additions and 181 deletions.
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ docs-website: dw_announce dw_1 dw_2 dw_3 dw_4

docs-website-clean:
rm -rf h2o-docs/source/developuser/DocGen
rm -rf h2o-docs/source/developuser/ScalaGen
$(MAKE) -C h2o-docs clean
endif

Expand All @@ -206,6 +207,8 @@ dw_1:
mkdir -p h2o-docs/source/developuser/DocGen
cd h2o-docs/source/developuser/DocGen && java -Xmx1g -jar "$(TOPDIR)/target/h2o.jar" -runClass water.api.DocGen -port $(PORT) -name $(TMPDIR) -ice_root $(TMPDIR) 1> /dev/null
rm -rf $(TMPDIR)
mkdir -p h2o-docs/source/developuser/ScalaGen
cp -p h2o-scala/README.rst h2o-docs/source/developuser/ScalaGen/README.rst

# If this fails, you might need to do the following:
# $ (possibly sudo) easy_install pip
Expand All @@ -225,8 +228,6 @@ dw_3:
cp -p docs/H2O_on_Hadoop_0xdata.pdf $(BUILD_WEBSITE_DIR)/bits/hadoop
mkdir -p $(BUILD_WEBSITE_DIR)/bits/ec2
cp -p ec2/README.txt $(BUILD_WEBSITE_DIR)/bits/ec2
mkdir -p $(BUILD_WEBSITE_DIR)/bits/h2o-scala
cp -p h2o-scala/README.md $(BUILD_WEBSITE_DIR)/bits/h2o-scala/README.txt

# Note: to get pdfunite on a mac, try:
# $ brew install poppler
Expand Down
12 changes: 7 additions & 5 deletions R/examples/HUDdemo.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
library(h2o)
h2o.server<- h2o.init()

hud<- h2o.uploadFile(h2o.server, "../smalldata/hud.clean.csv")
hud<- h2o.uploadFile(h2o.server, "h2o/smalldata/hud.clean.csv")

#Poke around at the data, take a look at the variables
head(hud)
Expand All @@ -27,13 +27,15 @@ nrow(hud.new)
quantile(hud.new$RENT)
summary(hud.new$RENT)
quantile(hud.new$ZSMHC)

summary(hud.new$ZSMHC)

#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value.
#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value. (EG - if you look at the summary for $RENT- the 75th percentile is $1085, while the max is 5892. The rest of the data are clearly centered around a much lower value - making $5K a month for rent look like an outlier.)
# If you look at a table for RENT - you can see that the data are pretty clumpy around round numbers (i.e., people are more likely to pay $700 for rent
# than $678, or $723), and then there is a weird spike at the highest value of $5892 - about $3000 more than the second highest value. They might be legitimate observations, but rents this high are well away from the rest of the distribution, so we'll separate these highest values out for now, and consider the rents that fall in the normal range.
as.data.frame(table(hud.new$RENT))
hud.short<- hud.new[(hud.new$RENT< 3000),]
hud.short<- hud.new[(hud.new$RENT< 3000),] #pull the extreme values of rent -
hud.high<- hud.new[(hud.new$RENT > 3000),] #pull the extreme values of rent into their own data frame to look at later-
summary(hud.short)

#Running a quick Kmeans model allows us to further characterize: (for instance, note in the cluster generated below that the rents in the upper middle group also have much lower incidence of income from social safety nets, and lower incidence of rodents. At the highest rents level the incidence of all of these increases again, suggesting that higher rents are not necessarily an indicator of higher quality housing)
Expand All @@ -58,15 +60,15 @@ hud.short.train<- hud.short[(hud.short[,71]<= .80),]
hud.short.test<- hud.short[(hud.short[,71]> .80),]
nrow(hud.short.train)
nrow(hud.short.test)
summary(hud.short.test)

preds = c("REGMOR", "DIVISION", "REGION", "METRO", "STATE", "LMED", "LMEDA", "LMEDB", "FMR", "FMRA", "FMRB", "L30", "L50", "L80", "IPOV", "PER", "ZADULT", "ZINC", "ZINC2", "QSELF", "QSS", "QSSI", "QWELF", "QRETIR", "QWKCMP", "POOR", "VCHER", "VCHRMOV", "RENEW", "APPLY", "ROOMS", "PHONE", "KITCHEN", "PLUMB", "DISH", "WASH", "DRY", "OVEN", "COOK", "NUNIT2", "BATHS", "BEDRMS", "DENS", "DINING", "FAMRM", "HALFB", "KITCH", "LIVING" ,"OTHFN", "ELECT", "AIRSYS", "STOVE", "PORTH", "DISPL", "TRASH", "REFR", "TOILET", "TUB", "RATS", "MICE", "MOLD", "EROACH", "EVROD")
L = c(seq(from= 0, to = 1, by= .01))
hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .1), lambda = L, nfolds=0, data=hud.short.train)
hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .5), lambda = L, nfolds=0, data=hud.short.train)
hud.reg
hud.best<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=0, lambda =0, nfolds=0, data=hud.short.train)
hud.test<- h2o.predict(hud.best, hud.short.test)
summary(hud.test)




25 changes: 10 additions & 15 deletions R/tests/testdir_golden/runit_km2_1_golden.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,24 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

test.km2vanilla.golden <- function(H2Oserver) {
# withinss addressed in JIRA 1489
#Import data:
#Log.info("Importing IRIS data...")
#irisH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/iris/iris.csv"), key="irisH2O")
#irisR<- read.csv(locate("smalldata/iris/iris.csv"), header=F)
Log.info("Importing IRIS data...")
irisH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/iris/iris.csv"), key="irisH2O")
irisR<- read.csv(locate("smalldata/iris/iris.csv"), header=F)


#fitR<- kmeans(irisR[,1:4], centers=3, iter.max=1000, nstart=10)
#fitH2O<- h2o.kmeans(irisH2O, centers=3, cols=c("C1", "C2", "C3", "C4"))
fitR<- kmeans(irisR[,1:4], centers=3, iter.max=1000, nstart=10)
fitH2O<- h2o.kmeans(irisH2O, centers=3, cols=c("C1", "C2", "C3", "C4"))

# Sanity check to make sure required fields are actually present in the model that gets returned.
#if (! ('withinss' %in% names(fitH2O@model))) {
# stop("H2O model has no component 'withinss'")
#}

#wssR<-sort.int(fitR$withinss)
#wssH2O<- sort.int(fitH2O@model$withinss)
wssR<-sort.int(fitR$withinss)
wssH2O<- sort.int(fitH2O@model$withinss)


#Log.info(paste("H2O WithinSS : ", wssH2O, "\t\t", "R WithinSS : ", wssR))
Log.info(paste("H2O WithinSS : ", wssH2O, "\t\t", "R WithinSS : ", wssR))

#Log.info("Compare Within SS between R and H2O")
#expect_equal(wssR, wssH2O, tolerance = 0.10)
Log.info("Compare Within SS between R and H2O")
expect_equal(wssR, wssH2O, tolerance = 0.10)

testEnd()
}
Expand Down
41 changes: 15 additions & 26 deletions R/tests/testdir_golden/runit_km2_2_golden.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,31 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

test.km2vanilla.golden <- function(H2Oserver) {
#within ss addressed in JIRA 1489

#Import Data:
#dummyH2O<- h2o.uploadFile.FV(H2Oserver, locate("../../smalldata/dummydata.csv"), key="dummyH2O")
#dummyR<- read.csv(locate("smalldata/dummydata.csv"), header=T)
dummyH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/dummydata.csv"), key="dummyH2O")
dummyR<- read.csv(locate("smalldata/dummydata.csv"), header=T)

#Remove unneeded cols
#dataR<- dummyR[,-1]
#dataH2O<- dummyH2O[,-1]

#Fit matching R and H2O models for k=2 on simple data
#fitR<- kmeans(dataR, centers=2)
#fitH2O<- h2o.kmeans.FV(dataH2O, centers=2)

# Not sure why building a kmeans cluster with one center is useful.
# But it found an interesting bug in summary2, so I'll leave it.
#fit2H2O<- h2o.kmeans.FV(dataH2O, centers=1)
#Fit matching R and H2O models for k=2 on simple data
fitR<- kmeans(dummyR[,2:3], centers=2)
fitH2O<- h2o.kmeans.FV(dummyH2O, centers=2, cols=c("V1", "V2"))

# Sanity check to make sure required fields are actually present in the model that gets returned.
#if (! ('withinss' %in% names(fitH2O@model))) {
# stop("H2O model has no component 'withinss'")
#}
# Build a 1 center model because that's the baseline against which K=n >1 will be compared
fit2H2O<- h2o.kmeans.FV(dummyH2O, centers=1)

#if (! ('totss' %in% names(fitH2O@model))) {
# stop("H2O model has no component 'totss'")
#}

#Log.info("Print model statistics for R and H2O... \n")
Log.info("Print model statistics for R and H2O... \n")
#Note that there are two "total" statistics: total within ss, and total ss. Total ss is the total variance in the whole data set, is computed as the sum of the squared vector norms between each point and the data mean, and is equal to within cluster sum of squares when k=1. As of Dec 21 K means in H2O does not produce total ss as an accessible metric in R, and does not model k=1 (known to jira).
#Log.info(paste("H2O WithinSS : ", fitH2O@model$withinss, "\t\t", "R WithinSS : ", fitR$withinss))
#Log.info(paste("H2O TotalSS : ", fitH2O@model$totss, "\t\t", "R TotalSS : ", fitR$totss))
Log.info(paste("H2O WithinSS : ", fitH2O@model$withinss, "\t\t", "R WithinSS : ", fitR$withinss))
Log.info(paste("H2O TotalSS : ", fitH2O@model$totss, "\t\t", "R TotalSS : ", fitR$totss))

#Log.info("Compare model descriptives in R to model statistics in H2O")
#expect_equal(fitH2O@model$withinss, fitR$withinss, tolerance = 0.01)
#expect_equal(fitH2O@model$totss, fitR$totss, tolerance = 0.01)
expect_equal(fitH2O@model$withinss, fitR$withinss, tolerance = 0.01)
expect_equal(fitH2O@model$totss, fitR$totss, tolerance = 0.01)

#testEnd()
testEnd()
}

#doTest("K Means test on well separated dummy data example", test.km2vanilla.golden)
doTest("K Means test on well separated dummy data example", test.km2vanilla.golden)
57 changes: 57 additions & 0 deletions R/tests/testdir_golden/runit_summary_golden.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# Golden test: uploads makedata.csv to H2O, reads the same file into base R,
# and checks that columns 2 through 9 of the two summary() tables agree
# within a tolerance of .01.
#
# H2Oserver: H2O connection object passed in by doTest().
test.summaryquantiles.golden <- function(H2Oserver) {

  #Import data: (the data are 20000 observations pulled from known distributions - parameters given at end of test)
  Log.info("Importing MAKE data...")
  makeH2O <- h2o.uploadFile(H2Oserver, locate("../../smalldata/makedata.csv"), key="makeH2O")
  makeR <- read.csv(locate("smalldata/makedata.csv"), header=T)

  #Obtain summary for both:
  sumH2O <- summary(makeH2O)
  sumR <- summary(makeR)

  # Summary-table columns that are logged and compared.
  cols <- 2:9

  Log.info("Print summary for H2O and R... \n")
  for (col in cols) {
    # Log the H2O value next to the R value for the same column. The original
    # printed sumR under the "H2O summary" label for columns 8 and 9, which
    # hid the H2O values from the log.
    Log.info(paste("H2O summary :", sumH2O[,col]))
    Log.info(paste("R summary :", sumR[,col]))
  }

  Log.info("Compare H2O summary to R summary... \n")
  for (col in cols) {
    expect_equal(sumH2O[,col], sumR[,col], tolerance=.01)
  }
  testEnd()
}

doTest("Summary and Quantiles", test.summaryquantiles.golden)

#A: normal, mean: -100, sd = 50
#B: uniform, min: -5000, max: 2000
#C: poisson, lambda: 5
#D: cauchy, location: 50, scale: 500
#E: binom, size=100, prob=.1
#F: binom, size=100, prob=.02
#G: binom, size=10, prob=.01
#H: exponential: rate= .4
2 changes: 1 addition & 1 deletion h2o-docs/source/Ruser/R_studio.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ These instructions assume you are using R Studio 2.14.0 or later.

**STEP 1**

To use H2O in R, users need a copy of H2O.
To use H\ :sub:`2`\ O in R, users need a copy of H\ :sub:`2`\ O.
The download package can be obtained by clicking on the button Download H\ :sub:`2`\ O at `http://0xdata.com/downloadtable <http://0xdata.com/downloadtable/>`_.

Unzip the downloaded H\ :sub:`2`\ O zip file.
Expand Down
9 changes: 0 additions & 9 deletions h2o-docs/source/developuser/quickstart_scala.rst

This file was deleted.

2 changes: 1 addition & 1 deletion h2o-docs/source/developuser/top_developer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ Getting Started with Development in H\ :sub:`2`\ O
quickstart_eclipse
quickstart_idea
quickstart_mac
quickstart_scala
ScalaGen/README
java
rest
2 changes: 1 addition & 1 deletion h2o-docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ This section is for new users, and provides information about how to install and
Ruser/Rinstall
Ruser/R_studio
newuser/ec2
developuser/quickstart_scala.rst
developuser/ScalaGen/README

.. toctree::
:maxdepth: 2
Expand Down
4 changes: 2 additions & 2 deletions h2o-scala/README.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

Shalala
=======
Scala for H\ :sub:`2`\ O: Shalala
===================================

Overview
--------
Expand Down
9 changes: 5 additions & 4 deletions py/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -1449,6 +1449,7 @@ def one_hot(self, source, timeoutSecs=30, **kwargs):

# &offset=
# &view=
# FIX! need to have max > 1000?
def inspect(self, key, offset=None, view=None, max_column_display=1000, ignoreH2oError=False,
timeoutSecs=30, useVA=False):
if beta_features and not useVA:
Expand Down Expand Up @@ -2142,16 +2143,16 @@ def summary_page(self, key, timeoutSecs=60, noPrint=True, useVA=False, numRows=N
'source': key,
'cols': None,
# h2o won't let me go bigger?
'max_ncols': 1000,
# 'max_ncols': 1000000,
# 'max_ncols': 1000,
'max_ncols': 1000000,
}
else:
params_dict = {
'key': key,
'x': None,
# h2o won't let me go bigger?
'max_column_display': 1000,
# 'max_column_display': 1000000,
# 'max_column_display': 1000,
'max_column_display': 1000000,
}
browseAlso = kwargs.pop('browseAlso',False)
check_params_update_kwargs(params_dict, kwargs, 'summary_page', print_params=True)
Expand Down
4 changes: 2 additions & 2 deletions py/h2o_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,10 +406,10 @@ def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
else:
if not mins:
print h2o.dump_json(column)
raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, N, nacnt, numRows))
raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colName, nacnt, numRows))
if not maxs:
print h2o.dump_json(column)
raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, N, nacnt, numRows))
raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colName, nacnt, numRows))

hstart = column['hstart']
hstep = column['hstep']
Expand Down
13 changes: 2 additions & 11 deletions py/testdir_release/c7/test_c7_fvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@ def test_c7_rel(self):
print "For files that we want to put (for testing put), we can get non-private files"

csvFilename = 'part-00000b'
if getpass.getuser()=='kevin':
importFolderPath = '/home/hduser/data/'
else:
importFolderPath = '/mnt/0xcustomer-datasets/c2'

importFolderPath = '/mnt/0xcustomer-datasets/c2'
csvPathname = importFolderPath + "/" + csvFilename

# FIX! does 'separator=' take ints or ?? hex format
Expand All @@ -53,12 +49,7 @@ def test_c7_rel(self):
#summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2)
# summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500)
# can't do more than 1000
summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])

# Need to update this for new stuff
# leave off numCols so we don't check it vs. summary
h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numRows=numRows)

summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows)

keepPattern = "oly_|mt_|b_"
y = "is_purchase"
Expand Down
Loading

0 comments on commit 343da08

Please sign in to comment.