Commit b3ec1f8: updated code to match book

leslieahendrix committed Jan 8, 2022 (1 parent: f8c849a)

Showing 22 changed files with 60,497 additions and 136 deletions.
2 changes: 1 addition & 1 deletion 1regression/Autoregressive.R
@@ -1,4 +1,4 @@
-####### CHAPTER 3 - Space and Time #######
+####### Regression #######

###airline passenger data

2 changes: 1 addition & 1 deletion 1regression/Hass.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
## Hass avocado panel example

hass <- read.csv("hass.csv",stringsAsFactors=TRUE)
2 changes: 1 addition & 1 deletion 1regression/Spacial.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
###### Gaussian Process Modeling

# California census data
2 changes: 1 addition & 1 deletion 1regression/Spam.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
###### Logistic Regression

### Spam Logistic Regression
2 changes: 1 addition & 1 deletion 1regression/OJ.R → 1regression/regressionOJ.R
@@ -1,4 +1,4 @@
-####### CHAPTER 1 - Regression #######
+####### Regression #######
###### Regression

### Dominick's OJ Regression
2 changes: 2 additions & 0 deletions 2uncertainty/SoCalCars.R
@@ -1,3 +1,5 @@
+###### Uncertainty #######
+
# Import the data file on cars for sale into R
Cars <- read.csv("SoCalCars.csv", stringsAsFactors = TRUE)

2 changes: 2 additions & 0 deletions 3regularization/OutOfSample.R
@@ -1,4 +1,6 @@
+###### REGULARIZATION #######
+
## the introductory overfit example

set.seed(865) # this allows you to get the same "random" data
x<-seq(-3,6,0.5) # define x's
data<-(1.5 + (x-1)^2) + rnorm(length(x),0,2.5) #a quadratic function with random noise added
File renamed without changes.
2 changes: 1 addition & 1 deletion 5experiments/RD.R
@@ -1,4 +1,4 @@
-####### Experiments #######
+####### Causal Inference - Experiments #######
####### RD #######

dat <- read.csv("RD.csv")
89 changes: 6 additions & 83 deletions 7trees/CalCensus.R
@@ -1,58 +1,10 @@
-# California census data
-ca <- read.csv("CalCensus.csv")
-linc <- log(ca[,"medianIncome"])
-lhval <- log(ca[,"medianHouseValue"])
-summary(glm(lhval ~ linc))
-
-# [relatively] fast GP fits
-library(laGP)
-s <- ca[,1:2] # long and lat
-# fitting GP surfaces
-gpinc <- aGP(s, linc, s, end=20, verb=0)
-gphval <- aGP(s, lhval, s, end=20, verb=0)
-# calculate residuals and regress
-rinc <- linc - gpinc$mean
-rhval <- lhval - gphval$mean
-summary(glm(rhval ~ rinc))
-
-### plots!
-#png('CalIncFit.png', width=4, height=5, units="in", res=720)
-plot(linc, gpinc$mean, col=rgb(0,0,.1,.25), pch=16, bty="n",
-  xlab="log Median Income", ylab="GP fitted value")
-#dev.off()
-#png('CalHValFit.png', width=4, height=5, units="in", res=720)
-plot(lhval, gphval$mean, col=rgb(.1,0,0,.25), pch=16, bty="n",
-  xlab="log Median Home Value", ylab="GP fitted value")
-#dev.off()
+######################################
+#### Trees and Forests
+######################################

-# maps package is fun. Check out fields for more
-hvalBreaks <- quantile(ca$medianHouseValue,(0:20)/20)
-hvalCut <- cut(ca$medianHouseValue,breaks=hvalBreaks)
-hvalCols <- heat.colors(20)[as.numeric(hvalCut)]
+################## California census data

-incBreaks <- quantile(ca$medianIncome,(0:20)/20)
-incCut <- cut(ca$medianIncome,breaks=incBreaks)
-incCols <- heat.colors(20)[as.numeric(incCut)]
-
-library(maps)
-#png('CalHVal.png', width=4, height=5, units="in", res=720)
-map('state', 'california')
-points(ca[,1:2], col=hvalCols, pch=20)
-legend("topright", title="Home Value",
-  legend=c("15k","120k","180k","265k","500k"),
-  fill=heat.colors(5), bty="n")
-#dev.off()
-
-#png('CalInc.png', width=4, height=5, units="in", res=720)
-map('state', 'california')
-points(ca[,1:2], col=incCols, pch=20)
-legend("topright", title="Income",
-  legend=c("5k","26k","35k","47k","150k"),
-  fill=heat.colors(5), bty="n")
-#dev.off()
-
-######
-# Forests
+## Forests
ca <- read.csv("CalCensus.csv")

## First, let's do it with CART
@@ -65,11 +17,6 @@ plot(catree, col="grey50")
text(catree)
#dev.off()

-## looks like the most complicated tree is best!
-cvca <- cv.tree(catree)
-cvca$size[which.min(cvca$dev)]
-plot(cvca)

## Next, with random forest
## limit the number of trees and the minimum tree size for speed
## also run on 4 cores if you've got them
@@ -99,29 +46,5 @@ mtext("forest", line=1)
legend("topright", title="residuals", bty="n", pch=1,
pt.cex=c(2,1,1,2), col=c("black","black","red","red"), legend=c(2,1, -1,-2))
#dev.off()

-## out of sample test run
-MSE <- list(CART=NULL, RF=NULL)
-splits <- sample(1:10, nrow(ca), replace=TRUE)
-for(i in 1:10){
-  train <- which(splits!=i)
-  test <- which(splits==i)
-
-  rt <- tree(log(medianHouseValue) ~ ., data=ca[train,], mindev=1e-4)
-  yhat.rt <- predict(rt, newdata=ca[-train,])
-  MSE$CART <- c( MSE$CART,
-    var(log(ca$medianHouseValue)[-train] - yhat.rt))
-
-  rf <- ranger(log(medianHouseValue) ~ ., data=ca[train,],
-    num.tree=400, num.threads=4)
-  yhat.rf <- predict(rf, data=ca[-train,])$predictions
-  MSE$RF <- c( MSE$RF, var(log(ca$medianHouseValue)[-train] - yhat.rf) )
-
-  cat(i)
-}
-# results
-lapply(MSE, mean)
-# plot
-#png('CalOOS.png', width=5, height=5, units="in", res=720)
-boxplot(as.data.frame(MSE), col="dodgerblue", xlab="", ylab="MSE")
+#########################################
#dev.off()
10 changes: 9 additions & 1 deletion 7trees/cps.R
@@ -1,3 +1,9 @@
+######################################
+#### Trees and Forests
+######################################
+
+################## CPS Census Income Data
+
library(hdm)
# Current Population Survey in 2012
# similar data used in mulligan+rubinstein (QJE: 2008)
@@ -59,6 +65,7 @@ wagetreecut <- prune.tree(wagetree, best=5)
plot(wagetreecut, col="grey50")
text(wagetreecut)
#dev.off()
+#######################################

## random forest and variable importance
library(ranger)
@@ -96,4 +103,5 @@ glm(ytil ~ dtil-1)

# HTE
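# bin pexp into [0,10), [10,30), [30,Inf) so the treatment effect can vary by experience level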
pefac <- cut(cps$pexp, c(0,10,30,Inf),right=FALSE)
-coef(glm(ytil ~ dtil*pefac - pefac - dtil, data=cps))
+head(pefac)
+coef(glm(ytil ~ dtil*pefac - pefac - dtil - 1, data=cps))
25 changes: 25 additions & 0 deletions 8factors/bigstocks.csv
@@ -0,0 +1,25 @@
AAPL,830
GOOG,649
MSFT,568
FB,493
AMZN,470
JNJ,361
XOM,328
JPM,324
WMT,244
WFC,261
V,237
BAC,238
PG,236
T,234
GE,218
ORCL,203
CVX,202
DIS,158
BA,141
AMGN,125
CSCO,161
IBM,133
PFE,198
KO,197
PEP,170
37 changes: 0 additions & 37 deletions 8factors/guinness.R

This file was deleted.

Binary file removed 8factors/guinness.jpg
Binary file not shown.
6 changes: 6 additions & 0 deletions 8factors/protein.R
@@ -1,5 +1,10 @@
+######################################
+#### Factors
+######################################
+
### *** European Protein Consumption, in grams/person-day *** ###
food <- read.csv("protein.csv", row.names=1) # 1st column is country name
head(food,4)

# fit the factors
pcfood <- prcomp(food, scale=TRUE)
@@ -28,6 +33,7 @@ plot(wfood[,3:4], type="n", xlim=c(-3,3),bty="n")
text(x=wfood[,3], y=wfood[,4], labels=rownames(food), col=grpProtein$cluster)
#dev.off()

+summary(pcfood)
## Scree plot
#png('proteinScree.png', width=5, height=5, units="in", res=720)
plot(pcfood, main="European Protein PCA")
123 changes: 123 additions & 0 deletions 8factors/returns.R
@@ -0,0 +1,123 @@
######################################
#### Factors
######################################

## analysis of monthly return
# (these are monthly returns minus the t-bill risk free rate of return)
R <- read.csv("returns.csv", row.names=1)

## pull out the S&P 500 and CVX (Chevron)
head(R[,1:4],3)
sp500 <- R[,"SP500"]
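# the response y is the CVX return series; drop SP500 and CVX from the input matrix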
ind <- which(colnames(R)=="CVX")
y <- R[,ind]
R <- R[,-c(1,ind)]
dim(R)

## look at big tech
round(R[1:3,c("AMZN","GOOG","AAPL","MSFT","FB")],4)
R[27:30,"FB",drop=FALSE]

## use a regularized EM algorithm from Josse and Husson (2016), the missMDA package
library(missMDA)
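# impute missing entries using a rank-4 (npc=4) regularized PCA fit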
Ri <- imputePCA(R,npc=4)$completeObs
round(Ri[1:3,c("AMZN","GOOG","AAPL","MSFT","FB")], 4)

# use prcomp to fit the principal components
retpc <- prcomp(Ri, scale=TRUE)
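# w: PC scores for each month; phi: loadings (rotation matrix) for each stock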
w <- predict(retpc)
phi <- retpc$rotation

## plot big stock scores
bigs <- read.csv("bigstocks.csv", header=FALSE)
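# drop CVX from the plot list: it is the response y, not an input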
bigs <- bigs[-which(bigs[,1]=="CVX"),]
#png('returnsPCA.png', width=10, height=5, units="in", res=720)
par(mfrow=c(1,2),xpd=TRUE)
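# ticker label size is scaled by firm size (second column of bigstocks.csv)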
plot(phi[bigs[,1],1:2], type="n", bty="n")
text(phi[bigs[,1],1:2], labels=bigs[,1], cex=bigs[,2]/350, col="navy")
plot(phi[bigs[,1],3:4], type="n", bty="n")
text(phi[bigs[,1],3:4], labels=bigs[,1], cex=bigs[,2]/350, col="navy")
#dev.off()

#png('returnsZvSNP.png', width=4.5, height=4.5, units="in", res=720)
plot(w[,1],sp500, bty="n", pch=20, xlab="PC1 monthly score", ylab="S&P500 monthly return")
#dev.off()

## principal components regression
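# regress the CVX returns on the first four PC scores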
fit <- glm(y ~ PC1 + PC2 + PC3 + PC4, data=as.data.frame(w))
summary(fit)

# onto the lasso
library(gamlr)
set.seed(0)
alasso <- cv.gamlr(w, y, nfold=10)
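# drop the intercept and show which inputs the lasso selected (nonzero coefficients)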
B <- coef(alasso)[-1,]
B[B!=0]

# both raw stocks and the PCs
blasso <- cv.gamlr(cbind(w,Ri), y, foldid=alasso$foldid)
B <- coef(blasso)[-1,]
round(B[B!=0],5)

#png('returnsPCReg.png', width=10, height=5, units="in", res=720)
par(mfrow=c(1,2))
plot(alasso, ylim=c(0,0.004), bty="n")
mtext("PC Inputs",line=2, font=2, cex=1.1)
plot(blasso, ylim=c(0,0.004), bty="n")
mtext("PC+Stocks Inputs",line=2, font=2, cex=1.1)
#dev.off()

### marginal regression
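# phi is proportional to each stock's univariate regression slope on y;
# v = Ri %*% phi collapses them into a single marginal-regression factor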
phi <- cor(Ri, y)/apply(Ri,2,sd)
v <- Ri%*%phi
fwd <- glm(y ~ v)

#png('returnsMRG1.png', width=4.5, height=4.5, units="in", res=720)
plot(v, w[,1], bty="n", pch=20, xlab="MR factor", ylab="PC1")
#dev.off()
#png('returnsMRG2.png', width=4.5, height=4.5, units="in", res=720)
plot(fwd$fitted, y, pch=20, bty="n", xlab="MR fitted values")
#dev.off()


#### partial least squares
library(textir)
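# fit K=3 partial least squares factors (textir builds these via iterated marginal regression)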
retpls <- pls(x=Ri, y=y, K=3)

#png('returnsPLS.png', width=10, height=4, units="in", res=720)
par(mfrow=c(1,3), mai=c(.7,.7,.1,.1))
plot(retpls, bty="n", cex.lab=1.4, pch=21, bg="yellow")
#dev.off()

## look at the values
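# rescale loadings by each stock's sd, then list the largest in absolute value per factor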
phi <- retpls$loadings*apply(Ri,2,sd)
tail(phi[order(abs(phi[,1])),1])
tail(phi[order(abs(phi[,2])),2])
tail(phi[order(abs(phi[,3])),3])

## CV experiment
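# 10-fold CV over K=1..5 PLS factors, reusing the lasso's fold assignments for a fair comparison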
MSE <- matrix(nrow=10,ncol=5)
for(i in 1:10){
train <- which(alasso$foldid!=i)
test <- which(alasso$foldid==i)
for(k in 1:ncol(MSE)){
plsi <- pls(x=Ri[train,], y=y[train], K=k)
MSE[i,k] <- mean( (y[test] - predict(plsi, Ri[test,]))^2 )
}
cat(i)
}
colMeans(MSE)

MSE <- as.data.frame(MSE)
names(MSE) <- paste(1:ncol(MSE))
#png('returnsPLSOOS.png', width=4.5, height=4.5, units="in", res=720)
boxplot(MSE, col="yellow", ylab="mean square error", xlab="K", ylim=c(0,0.004))
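# dashed reference lines: best CV error from the PC-input lasso (red) and PC+stocks lasso (blue)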
abline(h=min(alasso$cvm), lty=2, col=2, lwd=1.5)
abline(h=min(blasso$cvm), lty=2, col=4, lwd=1.5)
#dev.off()

#png('returnsPLS2D.png', width=4.5, height=4.5, units="in", res=720)
par(xpd=TRUE)
plot(retpls$loadings[bigs[,1],1:2], type="n", bty="n", xlab="PLS(1)", ylab="PLS(2)")
text(retpls$loadings[bigs[,1],1:2], labels=bigs[,1], cex=bigs[,2]/350, col="navy")
#dev.off()