Example code and data for "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014.
- The book: "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014
- The support site: GitHub WinVector/zmPDSwR
- Chapter 5: Choosing and Evaluating Models
- Chapter 6: Using Memorization Methods
A workspace containing most of the results has been saved as KDD2009.Rdata and can be loaded in R with the command:
load('KDD2009.Rdata')
(Note: you will have to re-attach libraries such as ROCR to re-run some of the steps.)
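For example (the exact set of packages depends on which steps you re-run; the list below covers everything used in this file):
load('KDD2009.Rdata')
library('ROCR')          # AUC calculations
library('ggplot2')       # density plots
library('rpart')         # decision tree model
library('randomForest')  # random forest model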
Data (6-2-2013) from: http://www.sigkdd.org/kdd-cup-2009-customer-relationship-prediction

Checksums of the downloaded files:

$ shasum *
e43a38e3477e38b354943519954b719ec7623c2f  orange_small_train.data.zip
8274d23235630717659898900b7f74092ff339ad  orange_small_train_appetency.labels.txt
ec2de79844657fb892ec9047e6304c12b296ff68  orange_small_train_churn.labels.txt
4cd2d7c9b20fd3638883a91a2fed6a03a4d5d015  orange_small_train_upselling.labels.txt

Data to support examples in the chapter on memorization methods in "Practical Data Science with R" ( http://www.manning.com/zumel/ ).
Unpack and recompress the data (shell):
unzip orange_small_train.data.zip
gzip -9 orange_small_train.data
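(Alternatively, the table can be read straight from the zip archive from within R, skipping the shell steps; a sketch using base R's unz() connection, assuming the archive contains a file named orange_small_train.data:)
d <- read.table(unz('orange_small_train.data.zip','orange_small_train.data'),
   header=T,sep='\t',na.strings=c('NA',''))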
To prepare data in R:
d <- read.table('orange_small_train.data.gz',
   header=T,sep='\t',na.strings=c('NA',''))  # treat empty fields as missing
churn <- read.table('orange_small_train_churn.labels.txt',
header=F,sep='\t')
d$churn <- churn$V1
appetency <- read.table('orange_small_train_appetency.labels.txt',
header=F,sep='\t')
d$appetency <- appetency$V1
upselling <- read.table('orange_small_train_upselling.labels.txt',
header=F,sep='\t')
d$upselling <- upselling$V1
set.seed(729375)   # fix the random seed so the split is reproducible
d$rgroup <- runif(dim(d)[[1]])
dTrainAll <- subset(d,rgroup<=0.9)  # ~90% for training and calibration
dTest <- subset(d,rgroup>0.9)       # ~10% held out for final test
rm(list=c('d','churn','appetency','upselling'))
outcomes <- c('churn','appetency','upselling')
vars <- setdiff(colnames(dTrainAll),
c(outcomes,'rgroup'))
numericVars <- vars[sapply(dTrainAll[,vars],class) %in% c('numeric','integer')]
catVars <- vars[sapply(dTrainAll[,vars],class) %in% c('factor','character')]
useForCal <- rbinom(n=dim(dTrainAll)[[1]],size=1,prob=0.1)>0
dCal <- subset(dTrainAll,useForCal)    # ~10% of training held out for calibration
dTrain <- subset(dTrainAll,!useForCal)
outcome <- 'churn'
pos <- '1'   # which value of the outcome is the "positive" class
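A quick sanity check on the three-way split (illustrative; exact counts vary with the random draws):
dim(dTrain); dim(dCal); dim(dTest)   # roughly 81%/9%/10% of the 50,000 rows
table(dTrain[,outcome])              # churn is rare: most labels are -1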
Example modeling:
mkPredC <- function(outCol,varCol,appCol) {
pPos <- sum(outCol==pos)/length(outCol)  # overall rate of the positive class
naTab <- table(as.factor(outCol[is.na(varCol)]))
pPosWna <- (naTab/sum(naTab))[pos]       # positive rate where the variable is NA
vTab <- table(as.factor(outCol),varCol)
pPosWv <- (vTab[pos,]+1.0e-3*pPos)/(colSums(vTab)+1.0e-3)  # smoothed per-level rates
pred <- pPosWv[appCol]                   # look up the rate for each application row
pred[is.na(appCol)] <- pPosWna           # NA rows get the NA-level rate
pred[is.na(pred)] <- pPos                # novel levels fall back to the base rate
pred
}
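A toy run (synthetic values, not KDD2009 data) shows the three fallbacks in action:
mkPredC(c(1,1,-1,-1,-1,1),c('a','a','a','b','b',NA),c('a','b',NA,'c'))
# 'a' -> ~0.667 (2 of 3 positive), 'b' -> ~0 (0 of 2, smoothed),
# NA -> 1 (the one NA row was positive), novel 'c' -> 0.5 (base rate)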
for(v in catVars) {
pi <- paste('pred',v,sep='')
dTrain[,pi] <- mkPredC(dTrain[,outcome],dTrain[,v],dTrain[,v])
dTest[,pi] <- mkPredC(dTrain[,outcome],dTrain[,v],dTest[,v])
dCal[,pi] <- mkPredC(dTrain[,outcome],dTrain[,v],dCal[,v])
}
mkPredNcat <- function(outCol,varCol,appCol) {
# bucket the numeric variable into (up to) decile ranges, then reuse mkPredC
cuts <- unique(as.numeric(quantile(varCol,probs=seq(0, 1, 0.1),na.rm=T)))
varC <- cut(varCol,cuts)
appC <- cut(appCol,cuts)
mkPredC(outCol,varC,appC)
}
mkPredN <- mkPredNcat
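Note that cut() leaves the minimum value (open left endpoint) and NAs outside the bins, so both fall through to mkPredC's NA handling; a toy check:
x <- c(1:10,NA)
cuts <- unique(as.numeric(quantile(x,probs=seq(0,1,0.1),na.rm=T)))
table(cut(x,cuts),useNA='ifany')   # the value 1 and the NA both land in <NA>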
for(v in numericVars) {
pi <- paste('pred',v,sep='')
dTrain[,pi] <- mkPredN(dTrain[,outcome],dTrain[,v],dTrain[,v])
dTest[,pi] <- mkPredN(dTrain[,outcome],dTrain[,v],dTest[,v])
dCal[,pi] <- mkPredN(dTrain[,outcome],dTrain[,v],dCal[,v])
}
logLikelyhood <- function(outCol,predCol) {
# log likelihood of the observed outcomes under the predicted probabilities
sum(ifelse(outCol==pos,log(predCol),log(1-predCol)))
}
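A toy check of the sign and scale (values closer to 0 are better):
logLikelyhood(c(1,-1),c(0.9,0.2))   # log(0.9)+log(0.8) = -0.328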
entropy <- function(x) {
# Shannon entropy (in bits) of a vector of counts or probabilities
xpos <- x[x>0 & !is.na(x)]
scaled <- xpos/sum(xpos)
sum(-scaled*log(scaled,2))
}
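Toy checks: a 50/50 split of counts carries one bit, a 50/25/25 split one and a half:
entropy(c(1,1))     # 1
entropy(c(2,1,1))   # 1.5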
nPos <- sum(dTrain[,outcome]==pos)
pPos <- nPos/length(dTrain[,outcome])
eps <- 1.0e-5
for(v in c(catVars,numericVars)) {
pi <- paste('pred',v,sep='')
li <- paste('lift',v,sep='')
dTrain[,li] <- log((dTrain[,pi]+eps)/(pPos+eps))
dTest[,li] <- log((dTest[,pi]+eps)/(pPos+eps))
dCal[,li] <- log((dCal[,pi]+eps)/(pPos+eps))
}
library('ROCR')
selVars <- c()
minStep <- 5
baseRateTrain <- logLikelyhood(dTrain[,outcome],sum(dTrain[,outcome]==pos)/length(dTrain[,outcome]))
baseRateCheck <- logLikelyhood(dCal[,outcome],sum(dCal[,outcome]==pos)/length(dCal[,outcome]))
for(v in catVars) {
pi <- paste('pred',v,sep='')
li <- paste('lift',v,sep='')
# deviance improvement over the base rate, charged an entropy-based estimate
# of the effective number of levels (an AIC-style criterion)
liTrain <- 2*(logLikelyhood(dTrain[,outcome],dTrain[,pi]) - baseRateTrain -
   2^entropy(table(dTrain[,pi],useNA='ifany')))
liCheck <- 2*(logLikelyhood(dCal[,outcome],dCal[,pi]) - baseRateCheck -
   2^entropy(table(dCal[,pi],useNA='ifany')))
if((liTrain>=minStep) && (liCheck>=minStep)) {
print(sprintf("%s, trainAIC: %g calibrationAIC: %g",
pi,liTrain,liCheck))
selVars <- c(selVars,li)
}
}
print(selVars)
for(v in numericVars) {
pi <- paste('pred',v,sep='')
li <- paste('lift',v,sep='')
# as above, but a numeric variable is charged only one degree of freedom
liTrain <- 2*(logLikelyhood(dTrain[,outcome],dTrain[,pi])-baseRateTrain-1)
liCheck <- 2*(logLikelyhood(dCal[,outcome],dCal[,pi])-baseRateCheck-1)
if((liTrain>=minStep) && (liCheck>=minStep)) {
print(sprintf("%s, trainAIC: %g calibrationAIC: %g",
pi,liTrain,liCheck))
selVars <- c(selVars,li)
}
}
print(selVars)
f <- paste(outcome,'>0 ~ ',paste(selVars,collapse=' + '),sep='')
# Note: glm() really wants the conditional probabilities, not the joint
# probabilities, as inputs; but the two differ only by a factor of pPos,
# which the fitted linear coefficients can absorb.
model <- glm(as.formula(f),data=dTrain,family=binomial(link='logit'))
print(summary(model))
# could use stepwise regression at this point
dTrain$pred <- predict(model,type='response',newdata=dTrain)
dCal$pred <- predict(model,type='response',newdata=dCal)
dTest$pred <- predict(model,type='response',newdata=dTest)
perfTrain <- performance(prediction(dTrain$pred,dTrain[,outcome]>0),'auc')
perfCal <- performance(prediction(dCal$pred,dCal[,outcome]>0),'auc')
perfTest <- performance(prediction(dTest$pred,dTest[,outcome]>0),'auc')
print(paste('train',as.numeric(perfTrain@y.values)))
# [1] "train 0.733887878767371"
print(paste('cal',as.numeric(perfCal@y.values)))
# [1] "cal 0.71792789642535"
print(paste('test',as.numeric(perfTest@y.values)))
# [1] "test 0.726387559207025"
modelRateTrain <- logLikelyhood(dTrain[,outcome],dTrain$pred)
modelRateCal <- logLikelyhood(dCal[,outcome],dCal$pred)
modelRateTest <- logLikelyhood(dTest[,outcome],dTest$pred)
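These log likelihoods are easiest to read as a deviance-based pseudo R-squared (a sketch, not in the original listing; the factor of -2 in the deviance cancels in the ratio):
print(1-modelRateTrain/baseRateTrain)   # fraction of deviance explained, train
print(1-modelRateCal/baseRateCheck)     # and on calibration data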
library('ggplot2')
# show one of the "better" factors
ggplot(data=dTrain) + geom_density(aes(x=predVar218,color=as.factor(churn)))
ggplot(data=dTest) + geom_density(aes(x=predVar218,color=as.factor(churn)))
# show one of the "better" numeric predictions
ggplot(data=dTrain) + geom_density(aes(x=predVar126,color=as.factor(churn)))
ggplot(data=dTest) + geom_density(aes(x=predVar126,color=as.factor(churn)))
ggplot(data=dCal) + geom_density(aes(x=predVar126,color=as.factor(churn)))
# show the model
ggplot(data=dTrain) + geom_density(aes(x=pred,color=as.factor(churn)))
ggplot(data=dTest) + geom_density(aes(x=pred,color=as.factor(churn)))
# save(list=ls(),file='KDDSteps.RData')
calcAUC <- function(predcol,outcol) {
perf <- performance(prediction(predcol,outcol==pos),'auc')
as.numeric(perf@y.values)
}
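calcAUC just wraps the prediction()/performance() pattern used above; for example, it reproduces the calibration AUC (outcol==pos picks out the same rows as dCal[,outcome]>0):
calcAUC(dCal$pred,dCal[,outcome])   # same value as perfCal above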
library('rpart')
# single decision tree on the selected lift variables (cp=0.001 grows a large tree)
tmodel <- rpart(as.formula(f),data=dTrain,control=rpart.control(cp=0.001))
dTrain$tpred <- predict(tmodel,newdata=dTrain)
dCal$tpred <- predict(tmodel,newdata=dCal)
dTest$tpred <- predict(tmodel,newdata=dTest)
calcAUC(dTrain$tpred,dTrain[,outcome])
# [1] 0.6936439
calcAUC(dTest$tpred,dTest[,outcome])
# [1] 0.6785368
calcAUC(dCal$tpred,dCal[,outcome])
# [1] 0.6780398
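To look inside the fitted tree, rpart's own accessors apply (print(tmodel) is long; printcp() summarizes the complexity table behind the cp=0.001 choice):
printcp(tmodel)   # complexity parameter table with cross-validated error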
# AUCs by training
for(v in catVars) {
pi <- paste('pred',v,sep='')
aucTrain <- calcAUC(dTrain[,pi],dTrain[,outcome])
if(aucTrain>=0.8) {
aucCal <- calcAUC(dCal[,pi],dCal[,outcome])
print(sprintf("%s, trainAUC: %4.3f calibrationAUC: %4.3f",
pi,aucTrain,aucCal))
}
}
# AUCs by calibration
for(v in catVars) {
pi <- paste('pred',v,sep='')
aucTrain <- calcAUC(dTrain[,pi],dTrain[,outcome])
aucCal <- calcAUC(dCal[,pi],dCal[,outcome])
if(aucCal>=0.59) {
print(sprintf("%s, trainAUC: %4.3f calibrationAUC: %4.3f",
pi,aucTrain,aucCal))
}
}
for(v in numericVars) {
pi <- paste('pred',v,sep='')
aucTrain <- calcAUC(dTrain[,pi],dTrain[,outcome])
if(aucTrain>=0.8) {
aucCal <- calcAUC(dCal[,pi],dCal[,outcome])
print(sprintf("%s, trainAUC: %4.3f calibrationAUC: %4.3f",
pi,aucTrain,aucCal))
}
}
library('randomForest')
# note: randomForest() has no 'type' argument; the churn>0 response yields
# numeric predictions that are used as scores below
fmodel <- randomForest(as.formula(f),data=dTrain,ntree=100,maxnodes=50)
dTrain$fpred <- predict(fmodel,newdata=dTrain)
dTest$fpred <- predict(fmodel,newdata=dTest)
dCal$fpred <- predict(fmodel,newdata=dCal)
calcAUC(dTrain$fpred,dTrain[,outcome])
#[1] 0.7501219
calcAUC(dTest$fpred,dTest[,outcome])
# [1] 0.717073
calcAUC(dCal$fpred,dCal[,outcome])
# [1] 0.7109103
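randomForest's importance() accessor shows which of the selected lift variables the forest leans on (an illustrative sketch):
imp <- importance(fmodel)
head(imp[order(-imp[,1]),,drop=FALSE])   # top variables by node-purity importance
# varImpPlot(fmodel) plots the same ranking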
# better AUC estimate: repeat the calibration split many times and average
var <- 'Var217'
aucs <- rep(0,100)
for(rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=dim(dTrainAll)[[1]],size=1,prob=0.1)>0
predRep <- mkPredC(dTrainAll[!useForCalRep,outcome],
dTrainAll[!useForCalRep,var],
dTrainAll[useForCalRep,var])
aucs[rep] <- calcAUC(predRep,dTrainAll[useForCalRep,outcome])
}
mean(aucs)
# [1] 0.5556656
#print(aucs)
#ggplot() + geom_density(aes(x=aucs))
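The spread of these replicated estimates shows how noisy a single-split AUC can be (illustrative summaries):
sd(aucs)
quantile(aucs,probs=c(0.025,0.975))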