summary test added, data for the test added, minor edits to demo from last week

dearirenelang · dearirenelang · commit 1f77cffe4a5e · 2014-02-17T10:53:27.000-08:00
diff --git a/R/examples/HUDdemo.R b/R/examples/HUDdemo.R
@@ -1,7 +1,7 @@
 library(h2o)
 h2o.server<- h2o.init()
 
-hud<- h2o.uploadFile(h2o.server, "../smalldata/hud.clean.csv")
+hud<- h2o.uploadFile(h2o.server, "h2o/smalldata/hud.clean.csv")
 
 #Poke around at the data, take a look at the variables
 head(hud)
@@ -27,13 +27,15 @@ nrow(hud.new)
 quantile(hud.new$RENT)
 summary(hud.new$RENT)
 quantile(hud.new$ZSMHC)
+
 summary(hud.new$ZSMHC)
 
-#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value. 
+#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value. (E.g., in the summary for hud.new$RENT the 75th percentile is $1085, while the max is $5892. The rest of the data are clearly centered around a much lower value, making a rent of more than $5K a month look like an outlier.)
 # If you look at a table for RENT - you can see that the data are pretty clumpy around round numbers (i.e., people are more likely to pay $700 for # rent 
 # than $678, or $723), and then there is a weird spike at the highest value of $5892 - about $3000 more than the second highest value. They might be legitimate observations, but rents this high are well away from the rest of the distribution, so we'll separate these highest values out for now, and consider the rents that fall in the normal range. 
 as.data.frame(table(hud.new$RENT))
-hud.short<- hud.new[(hud.new$RENT< 3000),]
+hud.short<- hud.new[(hud.new$RENT< 3000),] #keep only the rents below $3000 (the normal range) for the analysis that follows
+hud.high<- hud.new[(hud.new$RENT >= 3000),] #pull the extreme values of rent into their own data frame to look at later; >= so a rent of exactly 3000 is not dropped from both subsets
 summary(hud.short)
 
 #Running a quick Kmeans model allows us to further characterize: (for instance, note in the cluster generated below that the rents in the upper middle group # also have much lower incidence of income from social safety nets, and lower incidence of rodents. At the highest rents level the incidents of all of these # increase again, suggesting that higher rents are not necessarily an indicator of higher quality housing) 
@@ -58,15 +60,15 @@ hud.short.train<- hud.short[(hud.short[,71]<= .80),]
 hud.short.test<- hud.short[(hud.short[,71]> .80),]
 nrow(hud.short.train)
 nrow(hud.short.test)
+summary(hud.short.test)
 
 preds = c("REGMOR", "DIVISION", "REGION", "METRO", "STATE", "LMED", "LMEDA", "LMEDB", "FMR", "FMRA", "FMRB", "L30", "L50", "L80", "IPOV", "PER", "ZADULT", "ZINC", "ZINC2", "QSELF", "QSS", "QSSI", "QWELF", "QRETIR", "QWKCMP", "POOR", "VCHER", "VCHRMOV", "RENEW", "APPLY", "ROOMS", "PHONE", "KITCHEN", "PLUMB", "DISH", "WASH", "DRY", "OVEN", "COOK", "NUNIT2", "BATHS", "BEDRMS", "DENS", "DINING", "FAMRM", "HALFB", "KITCH", "LIVING" ,"OTHFN", "ELECT", "AIRSYS", "STOVE", "PORTH", "DISPL", "TRASH", "REFR", "TOILET", "TUB", "RATS", "MICE", "MOLD", "EROACH", "EVROD")
 L = c(seq(from= 0, to = 1, by= .01))
-hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .1), lambda = L, nfolds=0, data=hud.short.train)
+hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .5), lambda = L, nfolds=0, data=hud.short.train)
 hud.reg
 hud.best<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=0, lambda =0, nfolds=0, data=hud.short.train)
 hud.test<- h2o.predict(hud.best, hud.short.test)
 summary(hud.test)
 
 
 
-
diff --git a/R/tests/testdir_golden/runit_summary_golden.R b/R/tests/testdir_golden/runit_summary_golden.R
@@ -0,0 +1,57 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) # switch to the directory of the path given in the "f" command-line argument (presumably this script, run via R -f) so the relative source() below resolves
+source('../findNSourceUtils.R') # presumably supplies the harness helpers used below (Log.info, locate, doTest, testEnd) - they are not defined in this file; confirm
+
+test.summaryquantiles.golden <- function(H2Oserver) {
+# Golden test: summary() of the H2O frame must match base R's summary() of the same CSV (tolerance .01). H2Oserver is the connection passed to h2o.uploadFile.
+#Import data: (the data are 20000 observations pulled from known distributions - parameters given at end of test)
+Log.info("Importing MAKE data...") 
+makeH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/makedata.csv"), key="makeH2O")
+makeR<- read.csv(locate("smalldata/makedata.csv"), header=TRUE)
+
+
+#Obtain summary for both: 
+sumH2O<- summary(makeH2O)
+sumR<- summary(makeR)
+
+Log.info("Print summary for H2O and R... \n")
+Log.info(paste("H2O summary :",  sumH2O[,2]))
+Log.info(paste("R summary :",  sumR[,2]))
+Log.info(paste("H2O summary :",  sumH2O[,3]))
+Log.info(paste("R summary :",  sumR[,3]))
+Log.info(paste("H2O summary :",  sumH2O[,4]))
+Log.info(paste("R summary :",  sumR[,4]))
+Log.info(paste("H2O summary :",  sumH2O[,5]))
+Log.info(paste("R summary :",  sumR[,5]))
+Log.info(paste("H2O summary :",  sumH2O[,6]))
+Log.info(paste("R summary :",  sumR[,6]))
+Log.info(paste("H2O summary :",  sumH2O[,7]))
+Log.info(paste("R summary :",  sumR[,7]))
+Log.info(paste("H2O summary :",  sumH2O[,8]))
+Log.info(paste("R summary :",  sumR[,8]))
+Log.info(paste("H2O summary :",  sumH2O[,9]))
+Log.info(paste("R summary :",  sumR[,9]))
+
+# Each of columns 2-9 is compared separately so a failure points at the offending column.
+# NOTE(review): column 1 is neither logged nor compared - presumably an index column; confirm that is intentional.
+Log.info("Compare H2O summary to R summary... \n")
+expect_equal(sumH2O[,2], sumR[,2], tolerance=.01)
+expect_equal(sumH2O[,3], sumR[,3], tolerance=.01)
+expect_equal(sumH2O[,4], sumR[,4], tolerance=.01)
+expect_equal(sumH2O[,5], sumR[,5], tolerance=.01)
+expect_equal(sumH2O[,6], sumR[,6], tolerance=.01)
+expect_equal(sumH2O[,7], sumR[,7], tolerance=.01)
+expect_equal(sumH2O[,8], sumR[,8], tolerance=.01)
+expect_equal(sumH2O[,9], sumR[,9], tolerance=.01)
+testEnd()
+}
+
+doTest("Summary and Quantiles", test.summaryquantiles.golden) # hand the test function and its label to the harness's doTest (defined outside this file)
+
+#A: normal, mean: -100, sd = 50
+#B: uniform, min: -5000, max: 2000
+#C: poisson, lambda: 5
+#D: cauchy, location: 50, scale: 500
+#E: binom, size=100, prob=.1
+#F: binom, size=100, prob=.02
+#G: binom, size=10, prob=.01
+#H: exponential: rate= .4
diff --git a/h2o-docs/source/Ruser/R_studio.rst b/h2o-docs/source/Ruser/R_studio.rst
@@ -8,7 +8,7 @@ These instructions assume you are using R Studio 2.14.0 or later.
 
 **STEP 1**
 
-To use H2O in R, users need a copy of H2O. 
+To use H\ :sub:`2`\ O in R, users need a copy of H\ :sub:`2`\ O. 
 The download package can be obtained by clicking on the button Download H\ :sub:`2`\ O at `http://0xdata.com/downloadtable <http://0xdata.com/downloadtable/>`_.
 
 Unzip the downloaded H\ :sub:`2`\ O zip file.
diff --git a/smalldata/makedata.csv b/smalldata/makedata.csv