Commit b3ec1f8: updated code to match book

leslieahendrix committed Jan 8, 2022 (1 parent: f8c849a)

Showing 22 changed files with 60,497 additions and 136 deletions.
2 changes: 1 addition & 1 deletion 1regression/Autoregressive.R
@@ -1,4 +1,4 @@
-####### CHAPTER 3 - Space and Time #######
+####### Regression #######

###airline passenger data

2 changes: 1 addition & 1 deletion 1regression/Hass.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
## Hass avocado panel example

hass <- read.csv("hass.csv",stringsAsFactors=TRUE)
2 changes: 1 addition & 1 deletion 1regression/Spacial.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
###### Gaussian Process Modeling

# California census data
2 changes: 1 addition & 1 deletion 1regression/Spam.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
###### Logistic Regression

### Spam Logistic Regression
2 changes: 1 addition & 1 deletion 1regression/OJ.R → 1regression/regressionOJ.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
###### Regression

### Dominick's OJ Regression
2 changes: 2 additions & 0 deletions 2uncertainty/SoCalCars.R
@@ -1,3 +1,5 @@
+###### Uncertainty #######
+
# Import the data file on cars for sale into R
Cars <- read.csv("SoCalCars.csv", stringsAsFactors = TRUE)

2 changes: 2 additions & 0 deletions 3regularization/OutOfSample.R
@@ -1,4 +1,6 @@
+###### REGULARIZATION #######
+
## the introductory overfit example

set.seed(865) # this allows you to get the same "random" data
x<-seq(-3,6,0.5) # define x's
data<-(1.5 + (x-1)^2) + rnorm(length(x),0,2.5) #a quadratic function with random noise added
File renamed without changes.
2 changes: 1 addition & 1 deletion 5experiments/RD.R
@@ -1,4 +1,4 @@
-####### Experiments #######
+####### Causal Inference - Experiments #######
####### RD #######

dat <- read.csv("RD.csv")
89 changes: 6 additions & 83 deletions 7trees/CalCensus.R
@@ -1,58 +1,10 @@
-# California census data
-ca <- read.csv("CalCensus.csv")
-linc <- log(ca[,"medianIncome"])
-lhval <- log(ca[,"medianHouseValue"])
-summary(glm(lhval ~ linc))
-
-# [relatively] fast GP fits
-library(laGP)
-s <- ca[,1:2] # long and lat
-# fitting GP surfaces
-gpinc <- aGP(s, linc, s, end=20, verb=0)
-gphval <- aGP(s, lhval, s, end=20, verb=0)
-# calculate residuals and regress
-rinc <- linc - gpinc$mean
-rhval <- lhval - gphval$mean
-summary(glm(rhval ~ rinc))
-
-### plots!
-#png('CalIncFit.png', width=4, height=5, units="in", res=720)
-plot(linc, gpinc$mean, col=rgb(0,0,.1,.25), pch=16, bty="n",
-  xlab="log Median Income", ylab="GP fitted value")
-#dev.off()
-#png('CalHValFit.png', width=4, height=5, units="in", res=720)
-plot(lhval, gphval$mean, col=rgb(.1,0,0,.25), pch=16, bty="n",
-  xlab="log Median Home Value", ylab="GP fitted value")
-#dev.off()
+######################################
+#### Trees and Forests
+######################################

-# maps package is fun. Check out fields for more
-hvalBreaks <- quantile(ca$medianHouseValue,(0:20)/20)
-hvalCut <- cut(ca$medianHouseValue,breaks=hvalBreaks)
-hvalCols <- heat.colors(20)[as.numeric(hvalCut)]
+################## California census data

-incBreaks <- quantile(ca$medianIncome,(0:20)/20)
-incCut <- cut(ca$medianIncome,breaks=incBreaks)
-incCols <- heat.colors(20)[as.numeric(incCut)]
-
-library(maps)
-#png('CalHVal.png', width=4, height=5, units="in", res=720)
-map('state', 'california')
-points(ca[,1:2], col=hvalCols, pch=20)
-legend("topright", title="Home Value",
-  legend=c("15k","120k","180k","265k","500k"),
-  fill=heat.colors(5), bty="n")
-#dev.off()
-
-#png('CalInc.png', width=4, height=5, units="in", res=720)
-map('state', 'california')
-points(ca[,1:2], col=incCols, pch=20)
-legend("topright", title="Income",
-  legend=c("5k","26k","35k","47k","150k"),
-  fill=heat.colors(5), bty="n")
-#dev.off()
-
-######
-# Forests
+## Forests
ca <- read.csv("CalCensus.csv")

## First, let's do it with CART
@@ -65,11 +17,6 @@ plot(catree, col="grey50")
text(catree)
#dev.off()

-## looks like the most complicated tree is best!
-cvca <- cv.tree(catree)
-cvca$size[which.min(cvca$dev)]
-plot(cvca)

## Next, with random forest
## limit the number of trees and the minimum tree size for speed
## also run on 4 cores if you've got them
@@ -99,29 +46,5 @@ mtext("forest", line=1)
legend("topright", title="residuals", bty="n", pch=1,
pt.cex=c(2,1,1,2), col=c("black","black","red","red"), legend=c(2,1, -1,-2))
#dev.off()

-## out of sample test run
-MSE <- list(CART=NULL, RF=NULL)
-splits <- sample(1:10, nrow(ca), replace=TRUE)
-for(i in 1:10){
-  train <- which(splits!=i)
-  test <- which(splits==i)
-
-  rt <- tree(log(medianHouseValue) ~ ., data=ca[train,], mindev=1e-4)
-  yhat.rt <- predict(rt, newdata=ca[-train,])
-  MSE$CART <- c( MSE$CART,
-    var(log(ca$medianHouseValue)[-train] - yhat.rt))
-
-  rf <- ranger(log(medianHouseValue) ~ ., data=ca[train,],
-    num.tree=400, num.threads=4)
-  yhat.rf <- predict(rf, data=ca[-train,])$predictions
-  MSE$RF <- c( MSE$RF, var(log(ca$medianHouseValue)[-train] - yhat.rf) )
-
-  cat(i)
-}
-# results
-lapply(MSE, mean)
-# plot
-#png('CalOOS.png', width=5, height=5, units="in", res=720)
-boxplot(as.data.frame(MSE), col="dodgerblue", xlab="", ylab="MSE")
+#########################################
#dev.off()
10 changes: 9 additions & 1 deletion 7trees/cps.R
@@ -1,3 +1,9 @@
+######################################
+#### Trees and Forests
+######################################
+
+################## CPS Census Income Data
+
library(hdm)
# Current Population Survey in 2012
# similar data used in mulligan+rubinstein (QJE: 2008)
@@ -59,6 +65,7 @@ wagetreecut <- prune.tree(wagetree, best=5)
plot(wagetreecut, col="grey50")
text(wagetreecut)
#dev.off()
+#######################################

## random forest and variable importance
library(ranger)
@@ -96,4 +103,5 @@ glm(ytil ~ dtil-1)

# HTE
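# bin pexp into [0,10), [10,30), [30,Inf) so the treatment effect can vary by experience level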
pefac <- cut(cps$pexp, c(0,10,30,Inf),right=FALSE)
-coef(glm(ytil ~ dtil*pefac - pefac - dtil, data=cps))
+head(pefac)
+coef(glm(ytil ~ dtil*pefac - pefac - dtil - 1, data=cps))
25 changes: 25 additions & 0 deletions 8factors/bigstocks.csv
@@ -0,0 +1,25 @@
AAPL,830
GOOG,649
MSFT,568
FB,493
AMZN,470
JNJ,361
XOM,328
JPM,324
WMT,244
WFC,261
V,237
BAC,238
PG,236
T,234
GE,218
ORCL,203
CVX,202
DIS,158
BA,141
AMGN,125
CSCO,161
IBM,133
PFE,198
KO,197
PEP,170
37 changes: 0 additions & 37 deletions 8factors/guinness.R

This file was deleted.

Binary file removed 8factors/guinness.jpg
Binary file not shown.
6 changes: 6 additions & 0 deletions 8factors/protein.R
@@ -1,5 +1,10 @@
+######################################
+#### Factors
+######################################
+
### *** European Protein Consumption, in grams/person-day *** ###
food <- read.csv("protein.csv", row.names=1) # 1st column is country name
head(food,4)

# fit the factors
pcfood <- prcomp(food, scale=TRUE)
@@ -28,6 +33,7 @@ plot(wfood[,3:4], type="n", xlim=c(-3,3),bty="n")
text(x=wfood[,3], y=wfood[,4], labels=rownames(food), col=grpProtein$cluster)
#dev.off()

+summary(pcfood)
## Scree plot
#png('proteinScree.png', width=5, height=5, units="in", res=720)
plot(pcfood, main="European Protein PCA")
123 changes: 123 additions & 0 deletions 8factors/returns.R
@@ -0,0 +1,123 @@
######################################
#### Factors
######################################

## analysis of monthly return
# (these are monthly returns minus the t-bill risk free rate of return)
R <- read.csv("returns.csv", row.names=1)

## pull out the S&P 500 and CVX (Chevron)
head(R[,1:4],3)
sp500 <- R[,"SP500"]
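# the response y is the CVX return series; drop SP500 and CVX from the input matrix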
ind <- which(colnames(R)=="CVX")
y <- R[,ind]
R <- R[,-c(1,ind)]
dim(R)

## look at big tech
round(R[1:3,c("AMZN","GOOG","AAPL","MSFT","FB")],4)
R[27:30,"FB",drop=FALSE]

## use a regularized EM algorithm from Josse and Husson (2016), the missMDA package
library(missMDA)
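# impute missing entries using a rank-4 (npc=4) regularized PCA fit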
Ri <- imputePCA(R,npc=4)$completeObs
round(Ri[1:3,c("AMZN","GOOG","AAPL","MSFT","FB")], 4)

# use prcomp to fit the principal components
retpc <- prcomp(Ri, scale=TRUE)
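# w: PC scores for each month; phi: loadings (rotation matrix) for each stock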
w <- predict(retpc)
phi <- retpc$rotation

## plot big stock scores
bigs <- read.csv("bigstocks.csv", header=FALSE)
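# drop CVX from the plot list: it is the response y, not an input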
bigs <- bigs[-which(bigs[,1]=="CVX"),]
#png('returnsPCA.png', width=10, height=5, units="in", res=720)
par(mfrow=c(1,2),xpd=TRUE)
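# ticker label size is scaled by firm size (second column of bigstocks.csv)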
plot(phi[bigs[,1],1:2], type="n", bty="n")
text(phi[bigs[,1],1:2], labels=bigs[,1], cex=bigs[,2]/350, col="navy")
plot(phi[bigs[,1],3:4], type="n", bty="n")
text(phi[bigs[,1],3:4], labels=bigs[,1], cex=bigs[,2]/350, col="navy")
#dev.off()

#png('returnsZvSNP.png', width=4.5, height=4.5, units="in", res=720)
plot(w[,1],sp500, bty="n", pch=20, xlab="PC1 monthly score", ylab="S&P500 monthly return")
#dev.off()

## principal components regression
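# regress the CVX returns on the first four PC scores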
fit <- glm(y ~ PC1 + PC2 + PC3 + PC4, data=as.data.frame(w))
summary(fit)

# onto the lasso
library(gamlr)
set.seed(0)
alasso <- cv.gamlr(w, y, nfold=10)
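# drop the intercept and show which inputs the lasso selected (nonzero coefficients)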
B <- coef(alasso)[-1,]
B[B!=0]

# both raw stocks and the PCs
blasso <- cv.gamlr(cbind(w,Ri), y, foldid=alasso$foldid)
B <- coef(blasso)[-1,]
round(B[B!=0],5)

#png('returnsPCReg.png', width=10, height=5, units="in", res=720)
par(mfrow=c(1,2))
plot(alasso, ylim=c(0,0.004), bty="n")
mtext("PC Inputs",line=2, font=2, cex=1.1)
plot(blasso, ylim=c(0,0.004), bty="n")
mtext("PC+Stocks Inputs",line=2, font=2, cex=1.1)
#dev.off()

### marginal regression
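# phi is proportional to each stock's univariate regression slope on y;
# v = Ri %*% phi collapses them into a single marginal-regression factor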
phi <- cor(Ri, y)/apply(Ri,2,sd)
v <- Ri%*%phi
fwd <- glm(y ~ v)

#png('returnsMRG1.png', width=4.5, height=4.5, units="in", res=720)
plot(v, w[,1], bty="n", pch=20, xlab="MR factor", ylab="PC1")
#dev.off()
#png('returnsMRG2.png', width=4.5, height=4.5, units="in", res=720)
plot(fwd$fitted, y, pch=20, bty="n", xlab="MR fitted values")
#dev.off()


#### partial least squares
library(textir)
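# fit K=3 partial least squares factors (textir builds these via iterated marginal regression)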
retpls <- pls(x=Ri, y=y, K=3)

#png('returnsPLS.png', width=10, height=4, units="in", res=720)
par(mfrow=c(1,3), mai=c(.7,.7,.1,.1))
plot(retpls, bty="n", cex.lab=1.4, pch=21, bg="yellow")
#dev.off()

## look at the values
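# rescale loadings by each stock's sd, then list the largest in absolute value per factor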
phi <- retpls$loadings*apply(Ri,2,sd)
tail(phi[order(abs(phi[,1])),1])
tail(phi[order(abs(phi[,2])),2])
tail(phi[order(abs(phi[,3])),3])

## CV experiment
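# 10-fold CV over K=1..5 PLS factors, reusing the lasso's fold assignments for a fair comparison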
MSE <- matrix(nrow=10,ncol=5)
for(i in 1:10){
train <- which(alasso$foldid!=i)
test <- which(alasso$foldid==i)
for(k in 1:ncol(MSE)){
plsi <- pls(x=Ri[train,], y=y[train], K=k)
MSE[i,k] <- mean( (y[test] - predict(plsi, Ri[test,]))^2 )
}
cat(i)
}
colMeans(MSE)

MSE <- as.data.frame(MSE)
names(MSE) <- paste(1:ncol(MSE))
#png('returnsPLSOOS.png', width=4.5, height=4.5, units="in", res=720)
boxplot(MSE, col="yellow", ylab="mean square error", xlab="K", ylim=c(0,0.004))
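# dashed reference lines: best CV error from the PC-input lasso (red) and PC+stocks lasso (blue)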
abline(h=min(alasso$cvm), lty=2, col=2, lwd=1.5)
abline(h=min(blasso$cvm), lty=2, col=4, lwd=1.5)
#dev.off()

#png('returnsPLS2D.png', width=4.5, height=4.5, units="in", res=720)
par(xpd=TRUE)
plot(retpls$loadings[bigs[,1],1:2], type="n", bty="n", xlab="PLS(1)", ylab="PLS(2)")
text(retpls$loadings[bigs[,1],1:2], labels=bigs[,1], cex=bigs[,2]/350, col="navy")
#dev.off()