Merge branch 'master' of github.com:0xdata/h2o

nagyistoce · Feb 17, 2014 · 343da08 · 343da08
2 parents 3a917ed + a9866de
commit 343da08
Show file tree

Hide file tree

Showing 19 changed files with 150,212 additions and 181 deletions.
diff --git a/Makefile b/Makefile
@@ -189,6 +189,7 @@ docs-website: dw_announce dw_1 dw_2 dw_3 dw_4
 
 docs-website-clean:
 	rm -rf h2o-docs/source/developuser/DocGen
+	rm -rf h2o-docs/source/developuser/ScalaGen
 	$(MAKE) -C h2o-docs clean
 endif
 
@@ -206,6 +207,8 @@ dw_1:
 	mkdir -p h2o-docs/source/developuser/DocGen
 	cd h2o-docs/source/developuser/DocGen && java -Xmx1g -jar "$(TOPDIR)/target/h2o.jar" -runClass water.api.DocGen -port $(PORT) -name $(TMPDIR) -ice_root $(TMPDIR) 1> /dev/null
 	rm -rf $(TMPDIR)
+	mkdir -p h2o-docs/source/developuser/ScalaGen
+	cp -p h2o-scala/README.rst h2o-docs/source/developuser/ScalaGen/README.rst
 
 # If this fails, you might need to do the following:
 #     $ (possibly sudo) easy_install pip
@@ -225,8 +228,6 @@ dw_3:
 	cp -p docs/H2O_on_Hadoop_0xdata.pdf $(BUILD_WEBSITE_DIR)/bits/hadoop
 	mkdir -p $(BUILD_WEBSITE_DIR)/bits/ec2
 	cp -p ec2/README.txt $(BUILD_WEBSITE_DIR)/bits/ec2
-	mkdir -p $(BUILD_WEBSITE_DIR)/bits/h2o-scala
-	cp -p h2o-scala/README.md $(BUILD_WEBSITE_DIR)/bits/h2o-scala/README.txt
 
 # Note:  to get pdfunite on a mac, try:
 #     $ brew install poppler

diff --git a/R/examples/HUDdemo.R b/R/examples/HUDdemo.R
@@ -1,7 +1,7 @@
 library(h2o)
 h2o.server<- h2o.init()
 
-hud<- h2o.uploadFile(h2o.server, "../smalldata/hud.clean.csv")
+hud<- h2o.uploadFile(h2o.server, "h2o/smalldata/hud.clean.csv")
 
 #Poke around at the data, take a look at the variables
 head(hud)
@@ -27,13 +27,15 @@ nrow(hud.new)
 quantile(hud.new$RENT)
 summary(hud.new$RENT)
 quantile(hud.new$ZSMHC)
+
 summary(hud.new$ZSMHC)
 
-#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value. 
+#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value. (EG - if you look at the summary for $RENT- the 75th percentile is $1085, while the max is 5892. The rest of the data are clearly centered around a much lower value - making $5K a month for rent look like an outlier.)
 # If you look at a table for RENT - you can see that the data are pretty clumpy around round numbers (i.e., people are more likely to pay $700 for rent 
 # than $678, or $723), and then there is a weird spike at the highest value of $5892 - about $3000 more than the second highest value. They might be legitimate observations, but rents this high are well away from the rest of the distribution, so we'll separate these highest values out for now, and consider the rents that fall in the normal range. 
 as.data.frame(table(hud.new$RENT))
-hud.short<- hud.new[(hud.new$RENT< 3000),]
+hud.short<- hud.new[(hud.new$RENT< 3000),] #keep only the rows with rent below 3000, i.e. drop the extreme values of rent
+hud.high<- hud.new[(hud.new$RENT > 3000),] #pull the extreme values of rent (above 3000) into their own data frame to look at later. NOTE(review): a row with RENT exactly 3000 lands in neither frame
 summary(hud.short)
 
 #Running a quick Kmeans model allows us to further characterize: (for instance, note in the cluster generated below that the rents in the upper middle group # also have much lower incidence of income from social safety nets, and lower incidence of rodents. At the highest rents level the incidents of all of these # increase again, suggesting that higher rents are not necessarily an indicator of higher quality housing) 
@@ -58,15 +60,15 @@ hud.short.train<- hud.short[(hud.short[,71]<= .80),]
 hud.short.test<- hud.short[(hud.short[,71]> .80),]
 nrow(hud.short.train)
 nrow(hud.short.test)
+summary(hud.short.test)
 
 preds = c("REGMOR", "DIVISION", "REGION", "METRO", "STATE", "LMED", "LMEDA", "LMEDB", "FMR", "FMRA", "FMRB", "L30", "L50", "L80", "IPOV", "PER", "ZADULT", "ZINC", "ZINC2", "QSELF", "QSS", "QSSI", "QWELF", "QRETIR", "QWKCMP", "POOR", "VCHER", "VCHRMOV", "RENEW", "APPLY", "ROOMS", "PHONE", "KITCHEN", "PLUMB", "DISH", "WASH", "DRY", "OVEN", "COOK", "NUNIT2", "BATHS", "BEDRMS", "DENS", "DINING", "FAMRM", "HALFB", "KITCH", "LIVING" ,"OTHFN", "ELECT", "AIRSYS", "STOVE", "PORTH", "DISPL", "TRASH", "REFR", "TOILET", "TUB", "RATS", "MICE", "MOLD", "EROACH", "EVROD")
 L = c(seq(from= 0, to = 1, by= .01))
-hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .1), lambda = L, nfolds=0, data=hud.short.train)
+hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .5), lambda = L, nfolds=0, data=hud.short.train)
 hud.reg
 hud.best<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=0, lambda =0, nfolds=0, data=hud.short.train)
 hud.test<- h2o.predict(hud.best, hud.short.test)
 summary(hud.test)
 
 
 
-
diff --git a/R/tests/testdir_golden/runit_km2_1_golden.R b/R/tests/testdir_golden/runit_km2_1_golden.R
@@ -2,29 +2,24 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
 source('../findNSourceUtils.R')
 
 test.km2vanilla.golden <- function(H2Oserver) {
-# withinss addressed in JIRA 1489
 #Import data: 
-#Log.info("Importing IRIS data...") 
-#irisH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/iris/iris.csv"), key="irisH2O")
-#irisR<- read.csv(locate("smalldata/iris/iris.csv"), header=F)
+Log.info("Importing IRIS data...") 
+irisH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/iris/iris.csv"), key="irisH2O")
+irisR<- read.csv(locate("smalldata/iris/iris.csv"), header=F)
 
 
-#fitR<- kmeans(irisR[,1:4], centers=3, iter.max=1000, nstart=10)
-#fitH2O<- h2o.kmeans(irisH2O, centers=3, cols=c("C1", "C2", "C3", "C4"))
+fitR<- kmeans(irisR[,1:4], centers=3, iter.max=1000, nstart=10)
+fitH2O<- h2o.kmeans(irisH2O, centers=3, cols=c("C1", "C2", "C3", "C4"))
 
-# Sanity check to make sure required fields are actually present in the model that gets returned.
-#if (! ('withinss' %in% names(fitH2O@model))) {
-# stop("H2O model has no component 'withinss'")
-#}
 
-#wssR<-sort.int(fitR$withinss)
-#wssH2O<- sort.int(fitH2O@model$withinss)
+wssR<-sort.int(fitR$withinss)
+wssH2O<- sort.int(fitH2O@model$withinss)
 
 
-#Log.info(paste("H2O WithinSS  : ", wssH2O, "\t\t", "R WithinSS : ", wssR))
+Log.info(paste("H2O WithinSS  : ", wssH2O, "\t\t", "R WithinSS : ", wssR))
 
-#Log.info("Compare Within SS between R and H2O")
-#expect_equal(wssR, wssH2O, tolerance = 0.10)
+Log.info("Compare Within SS between R and H2O")
+expect_equal(wssR, wssH2O, tolerance = 0.10)
 
 testEnd()
 }

diff --git a/R/tests/testdir_golden/runit_km2_2_golden.R b/R/tests/testdir_golden/runit_km2_2_golden.R
@@ -2,42 +2,31 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
 source('../findNSourceUtils.R')
 
 test.km2vanilla.golden <- function(H2Oserver) {
-#within ss addressed in JIRA 1489
+
 #Import Data:
-#dummyH2O<- h2o.uploadFile.FV(H2Oserver, locate("../../smalldata/dummydata.csv"), key="dummyH2O")
-#dummyR<- read.csv(locate("smalldata/dummydata.csv"), header=T)
+dummyH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/dummydata.csv"), key="dummyH2O")
+dummyR<- read.csv(locate("smalldata/dummydata.csv"), header=T)
 
-#Remove unneeded cols
-#dataR<- dummyR[,-1]
-#dataH2O<- dummyH2O[,-1]
 
-#Fit matching R and H2O models for k=2 on simple data
-#fitR<- kmeans(dataR, centers=2)
-#fitH2O<- h2o.kmeans.FV(dataH2O, centers=2)
 
-# Not sure why building a kmeans cluster with one center is useful.
-# But it found an interesting bug in summary2, so I'll leave it.
-#fit2H2O<- h2o.kmeans.FV(dataH2O, centers=1)
+#Fit matching R and H2O models for k=2 on simple data
+fitR<- kmeans(dummyR[,2:3], centers=2)
+fitH2O<- h2o.kmeans.FV(dummyH2O, centers=2, cols=c("V1", "V2"))
 
-# Sanity check to make sure required fields are actually present in the model that gets returned.
-#if (! ('withinss' %in% names(fitH2O@model))) {
- # stop("H2O model has no component 'withinss'")
-#}
+# Build a 1 center model because that's the baseline against which K=n >1 will be compared
+fit2H2O<- h2o.kmeans.FV(dummyH2O, centers=1)
 
-#if (! ('totss' %in% names(fitH2O@model))) {
- # stop("H2O model has no component 'totss'")
-#}
 
-#Log.info("Print model statistics for R and H2O... \n")
+Log.info("Print model statistics for R and H2O... \n")
 #Note that there are two "total" statistics: total within ss, and total ss. Total ss is the total variance in the whole data set, #is computed as the sum of the vector norms between each point and the data mean, and is equal to within cluster sum of squares #when k=1. As of Dec 21 K means in H2O does not produce total ss as an accessible metric in R, and does not model k=1 (known to #jira).
-#Log.info(paste("H2O WithinSS  : ", fitH2O@model$withinss, "\t\t", "R WithinSS   : ", fitR$withinss))
-#Log.info(paste("H2O TotalSS   : ", fitH2O@model$totss,    "\t\t", "R TotalSS    : ", fitR$totss))
+Log.info(paste("H2O WithinSS  : ", fitH2O@model$withinss, "\t\t", "R WithinSS   : ", fitR$withinss))
+Log.info(paste("H2O TotalSS   : ", fitH2O@model$totss,    "\t\t", "R TotalSS    : ", fitR$totss))
 
 #Log.info("Compare model descriptives in R to model statistics in H2O")
-#expect_equal(fitH2O@model$withinss, fitR$withinss, tolerance = 0.01)
-#expect_equal(fitH2O@model$totss,    fitR$totss,    tolerance = 0.01)
+expect_equal(fitH2O@model$withinss, fitR$withinss, tolerance = 0.01)
+expect_equal(fitH2O@model$totss,    fitR$totss,    tolerance = 0.01)
 
-#testEnd()
+testEnd()
 }
 
-#doTest("K Means test on well separated dummy data example", test.km2vanilla.golden)
+doTest("K Means test on well separated dummy data example", test.km2vanilla.golden)
diff --git a/R/tests/testdir_golden/runit_summary_golden.R b/R/tests/testdir_golden/runit_summary_golden.R
@@ -0,0 +1,57 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source('../findNSourceUtils.R')
+
+test.summaryquantiles.golden <- function(H2Oserver) {
+# Golden test: summary() of the H2O frame must agree with base R's summary() of the same CSV.
+#Import data: (the data are 20000 observations pulled from known distributions - parameters given at end of test)
+Log.info("Importing MAKE data...") 
+makeH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/makedata.csv"), key="makeH2O")
+makeR<- read.csv(locate("smalldata/makedata.csv"), header=T)
+
+# H2Oserver is only used for the upload above (presumably supplied by doTest() - confirm).
+#Obtain summary for both: 
+sumH2O<- summary(makeH2O)
+sumR<- summary(makeR)
+
+Log.info("Print summary for H2O and R... \n")
+Log.info(paste("H2O summary :",  sumH2O[,2]))
+Log.info(paste("R summary :",  sumR[,2]))
+Log.info(paste("H2O summary :",  sumH2O[,3]))
+Log.info(paste("R summary :",  sumR[,3]))
+Log.info(paste("H2O summary :",  sumH2O[,4]))
+Log.info(paste("R summary :",  sumR[,4]))
+Log.info(paste("H2O summary :",  sumH2O[,5]))
+Log.info(paste("R summary :",  sumR[,5]))
+Log.info(paste("H2O summary :",  sumH2O[,6]))
+Log.info(paste("R summary :",  sumR[,6]))
+Log.info(paste("H2O summary :",  sumH2O[,7]))
+Log.info(paste("R summary :",  sumR[,7]))
+Log.info(paste("H2O summary :",  sumH2O[,8]))
+Log.info(paste("R summary :",  sumR[,8]))
+Log.info(paste("H2O summary :",  sumH2O[,9]))
+Log.info(paste("R summary :",  sumR[,9]))
+
+# Columns 2 through 9 of both summary tables are compared, each with tolerance=.01.
+# Column 1 is neither logged nor checked (NOTE(review): presumably an index/ID column - confirm).
+Log.info("Compare H2O summary to R summary... \n")
+expect_equal(sumH2O[,2], sumR[,2], tolerance=.01)
+expect_equal(sumH2O[,3], sumR[,3], tolerance=.01)
+expect_equal(sumH2O[,4], sumR[,4], tolerance=.01)
+expect_equal(sumH2O[,5], sumR[,5], tolerance=.01)
+expect_equal(sumH2O[,6], sumR[,6], tolerance=.01)
+expect_equal(sumH2O[,7], sumR[,7], tolerance=.01)
+expect_equal(sumH2O[,8], sumR[,8], tolerance=.01)
+expect_equal(sumH2O[,9], sumR[,9], tolerance=.01)
+testEnd()
+}
+
+doTest("Summary and Quantiles", test.summaryquantiles.golden)
+
+#A: normal, mean: -100, sd = 50
+#B: uniform, min: -5000, max: 2000
+#C: poisson, lambda: 5
+#D: cauchy, location: 50, scale: 500
+#E: binom, size=100, prob=.1
+#F: binom, size=100, prob=.02
+#G: binom, size=10, prob=.01
+#H: exponential: rate= .4
diff --git a/h2o-docs/source/Ruser/R_studio.rst b/h2o-docs/source/Ruser/R_studio.rst
@@ -8,7 +8,7 @@ These instructions assume you are using R Studio 2.14.0 or later.
 
 **STEP 1**
 
-To use H2O in R, users need a copy of H2O. 
+To use H\ :sub:`2`\ O in R, users need a copy of H\ :sub:`2`\ O.
 The download package can be obtained by clicking on the button Download H\ :sub:`2`\ O at `http://0xdata.com/downloadtable <http://0xdata.com/downloadtable/>`_.
 
 Unzip the downloaded H\ :sub:`2`\ O zip file.

diff --git a/h2o-docs/source/developuser/quickstart_scala.rst b/h2o-docs/source/developuser/quickstart_scala.rst
diff --git a/h2o-docs/source/developuser/top_developer.rst b/h2o-docs/source/developuser/top_developer.rst
@@ -12,6 +12,6 @@ Getting Started with Development in H\ :sub:`2`\ O
    quickstart_eclipse
    quickstart_idea
    quickstart_mac
-   quickstart_scala
+   ScalaGen/README
    java
    rest
diff --git a/h2o-docs/source/index.rst b/h2o-docs/source/index.rst
@@ -22,7 +22,7 @@ This section is for new users, and provides information about how to install and
    Ruser/Rinstall
    Ruser/R_studio
    newuser/ec2
-   developuser/quickstart_scala.rst
+   developuser/ScalaGen/README
 
 .. toctree::
    :maxdepth: 2

diff --git a/h2o-scala/README.rst b/h2o-scala/README.rst
@@ -1,6 +1,6 @@
 
-Shalala
-=======
+Scala for H\ :sub:`2`\ O: Shalala
+===================================
 
 Overview
 --------

diff --git a/py/h2o.py b/py/h2o.py
@@ -1449,6 +1449,7 @@ def one_hot(self, source, timeoutSecs=30, **kwargs):
 
     # &offset=
     # &view=
+    # FIX! need to have max > 1000? 
     def inspect(self, key, offset=None, view=None, max_column_display=1000, ignoreH2oError=False, 
         timeoutSecs=30, useVA=False):
         if beta_features and not useVA:
@@ -2142,16 +2143,16 @@ def summary_page(self, key, timeoutSecs=60, noPrint=True, useVA=False, numRows=N
                 'source': key,
                 'cols': None,
                 # h2o won't let me go bigger?
-                'max_ncols': 1000,
-                # 'max_ncols': 1000000,
+                # 'max_ncols': 1000,
+                'max_ncols': 1000000,
                 }
         else:
             params_dict = {
                 'key': key,
                 'x': None,
                 # h2o won't let me go bigger?
-                'max_column_display': 1000,
-                # 'max_column_display': 1000000,
+                # 'max_column_display': 1000,
+                'max_column_display': 1000000,
                 }
         browseAlso = kwargs.pop('browseAlso',False)
         check_params_update_kwargs(params_dict, kwargs, 'summary_page', print_params=True)

diff --git a/py/h2o_cmd.py b/py/h2o_cmd.py
@@ -406,10 +406,10 @@ def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
                 else:
                     if not mins:
                         print h2o.dump_json(column)
-                        raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, N, nacnt, numRows))
+                        raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colName, nacnt, numRows))
                     if not maxs:
                         print h2o.dump_json(column)
-                        raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, N, nacnt, numRows))
+                        raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colName, nacnt, numRows))
 
             hstart = column['hstart']
             hstep = column['hstep']

diff --git a/py/testdir_release/c7/test_c7_fvec.py b/py/testdir_release/c7/test_c7_fvec.py
@@ -24,11 +24,7 @@ def test_c7_rel(self):
         print "For files that we want to put (for testing put), we can get non-private files"
 
         csvFilename = 'part-00000b'
-        if getpass.getuser()=='kevin':
-            importFolderPath = '/home/hduser/data/'
-        else:
-            importFolderPath = '/mnt/0xcustomer-datasets/c2'
-
+        importFolderPath = '/mnt/0xcustomer-datasets/c2'
         csvPathname = importFolderPath + "/" + csvFilename
 
         # FIX! does 'separator=' take ints or ?? hex format
@@ -53,12 +49,7 @@ def test_c7_rel(self):
         #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2)
         # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500)
         # can't do more than 1000
-        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
-
-        # Need to update this for new stuff
-        # leave off numCols so we don't check it vs. summary
-        h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numRows=numRows)
-
+        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows)
 
         keepPattern = "oly_|mt_|b_"
         y = "is_purchase"