Skip to content

Commit 1f77cff

Browse files
committed
summary test added, data for tested added, minor edits to demo from last week
1 parent 2e8ac76 commit 1f77cff

File tree

4 files changed

+20066
-107
lines changed

4 files changed

+20066
-107
lines changed

R/examples/HUDdemo.R

+7-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
library(h2o)
22
h2o.server<- h2o.init()
33

4-
hud<- h2o.uploadFile(h2o.server, "../smalldata/hud.clean.csv")
4+
hud<- h2o.uploadFile(h2o.server, "h2o/smalldata/hud.clean.csv")
55

66
#Poke around at the data, take a look at the variables
77
head(hud)
@@ -27,13 +27,15 @@ nrow(hud.new)
2727
quantile(hud.new$RENT)
2828
summary(hud.new$RENT)
2929
quantile(hud.new$ZSMHC)
30+
3031
summary(hud.new$ZSMHC)
3132

32-
#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value.
33+
#Both DVs look like they have a strong skew to the right tail - driving the mean well above the median value. (E.g., if you look at the summary for RENT, the 75th percentile is $1085, while the max is $5892. The rest of the data are clearly centered around a much lower value - making $5K a month for rent look like an outlier.)
3334
# If you look at a table for RENT - you can see that the data are pretty clumpy around round numbers (i.e., people are more likely to pay $700 for rent
3435
# than $678, or $723), and then there is a weird spike at the highest value of $5892 - about $3000 more than the second highest value. They might be legitimate observations, but rents this high are well away from the rest of the distribution, so we'll separate these highest values out for now, and consider the rents that fall in the normal range.
3536
as.data.frame(table(hud.new$RENT))
36-
hud.short<- hud.new[(hud.new$RENT< 3000),]
37+
hud.short<- hud.new[(hud.new$RENT< 3000),] #keep only the rows with RENT below 3000; the extreme rents are left out of this frame
38+
hud.high<- hud.new[(hud.new$RENT >= 3000),] #pull the extreme values of rent (3000 and up, i.e. every row not kept by the RENT < 3000 filter) into their own data frame to look at later
3739
summary(hud.short)
3840

3941
#Running a quick Kmeans model allows us to further characterize: (for instance, note in the cluster generated below that the rents in the upper middle group # also have much lower incidence of income from social safety nets, and lower incidence of rodents. At the highest rents level the incidents of all of these # increase again, suggesting that higher rents are not necessarily an indicator of higher quality housing)
@@ -58,15 +60,15 @@ hud.short.train<- hud.short[(hud.short[,71]<= .80),]
5860
hud.short.test<- hud.short[(hud.short[,71]> .80),]
5961
nrow(hud.short.train)
6062
nrow(hud.short.test)
63+
summary(hud.short.test)
6164

6265
preds = c("REGMOR", "DIVISION", "REGION", "METRO", "STATE", "LMED", "LMEDA", "LMEDB", "FMR", "FMRA", "FMRB", "L30", "L50", "L80", "IPOV", "PER", "ZADULT", "ZINC", "ZINC2", "QSELF", "QSS", "QSSI", "QWELF", "QRETIR", "QWKCMP", "POOR", "VCHER", "VCHRMOV", "RENEW", "APPLY", "ROOMS", "PHONE", "KITCHEN", "PLUMB", "DISH", "WASH", "DRY", "OVEN", "COOK", "NUNIT2", "BATHS", "BEDRMS", "DENS", "DINING", "FAMRM", "HALFB", "KITCH", "LIVING" ,"OTHFN", "ELECT", "AIRSYS", "STOVE", "PORTH", "DISPL", "TRASH", "REFR", "TOILET", "TUB", "RATS", "MICE", "MOLD", "EROACH", "EVROD")
6366
L = c(seq(from= 0, to = 1, by= .01))
64-
hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .1), lambda = L, nfolds=0, data=hud.short.train)
67+
hud.reg<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=c(0, 0.001, .01, .5), lambda = L, nfolds=0, data=hud.short.train)
6568
hud.reg
6669
hud.best<- h2o.glm.FV(x=preds, y="ZSMHC", family="gaussian", standardize=T, alpha=0, lambda =0, nfolds=0, data=hud.short.train)
6770
hud.test<- h2o.predict(hud.best, hud.short.test)
6871
summary(hud.test)
6972

7073

7174

72-
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
2+
source('../findNSourceUtils.R')
3+
4+
test.summaryquantiles.golden <- function(H2Oserver) {
5+
6+
#Import data: (the data are 20000 observations pulled from known distributions - parameters given at end of test)
7+
Log.info("Importing MAKE data...")
8+
makeH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/makedata.csv"), key="makeH2O")
9+
makeR<- read.csv(locate("smalldata/makedata.csv"), header=T)
10+
11+
12+
#Obtain summary for both:
13+
sumH2O<- summary(makeH2O)
14+
sumR<- summary(makeR)
15+
16+
Log.info("Print summary for H2O and R... \n")
17+
Log.info(paste("H2O summary :", sumH2O[,2]))
18+
Log.info(paste("R summary :", sumR[,2]))
19+
Log.info(paste("H2O summary :", sumH2O[,3]))
20+
Log.info(paste("R summary :", sumR[,3]))
21+
Log.info(paste("H2O summary :", sumH2O[,4]))
22+
Log.info(paste("R summary :", sumR[,4]))
23+
Log.info(paste("H2O summary :", sumH2O[,5]))
24+
Log.info(paste("R summary :", sumR[,5]))
25+
Log.info(paste("H2O summary :", sumH2O[,6]))
26+
Log.info(paste("R summary :", sumR[,6]))
27+
Log.info(paste("H2O summary :", sumH2O[,7]))
28+
Log.info(paste("R summary :", sumR[,7]))
29+
Log.info(paste("H2O summary :", sumH2O[,8]))  # log the H2O side (column 8), not the R summary a second time
30+
Log.info(paste("R summary :", sumR[,8]))
31+
Log.info(paste("H2O summary :", sumH2O[,9]))  # log the H2O side (column 9), not the R summary a second time
32+
Log.info(paste("R summary :", sumR[,9]))
33+
34+
35+
36+
Log.info("Compare H2O summary to R summary... \n")
37+
expect_equal(sumH2O[,2], sumR[,2], tolerance=.01)
38+
expect_equal(sumH2O[,3], sumR[,3], tolerance=.01)
39+
expect_equal(sumH2O[,4], sumR[,4], tolerance=.01)
40+
expect_equal(sumH2O[,5], sumR[,5], tolerance=.01)
41+
expect_equal(sumH2O[,6], sumR[,6], tolerance=.01)
42+
expect_equal(sumH2O[,7], sumR[,7], tolerance=.01)
43+
expect_equal(sumH2O[,8], sumR[,8], tolerance=.01)
44+
expect_equal(sumH2O[,9], sumR[,9], tolerance=.01)
45+
testEnd()
46+
}
47+
48+
doTest("Summary and Quantiles", test.summaryquantiles.golden)
49+
50+
#A: normal, mean: -100, sd = 50
51+
#B: uniform, min: -5000, max: 2000
52+
#C: poisson, lambda: 5
53+
#D: cauchy, location: 50, scale: 500
54+
#E: binom, size=100, prob=.1
55+
#F: binom, size=100, prob=.02
56+
#G: binom, size=10, prob=.01
57+
#H: exponential: rate= .4

h2o-docs/source/Ruser/R_studio.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ These instructions assume you are using R Studio 2.14.0 or later.
88

99
**STEP 1**
1010

11-
To use H2O in R, users need a copy of H2O.
11+
To use H\ :sub:`2`\ O in R, users need a copy of H\ :sub:`2`\ O.
1212
The download package can be obtained by clicking on the button Download H\ :sub:`2`\ O at `http://0xdata.com/downloadtable <http://0xdata.com/downloadtable/>`_.
1313

1414
Unzip the downloaded H\ :sub:`2`\ O zip file.

0 commit comments

Comments
 (0)