.Rhistory

# avg_rating_of_driver   0.014333
# avg_surge              0.011648
# phoneOther             0.001099
# surge_or_notsurged     0.000000
# used_or_not_30daysused 0.000000
# weekday_bucketzero     0.000000
# weekday_bucketsoso     0.000000
# Plot the variable-importance object from the previous run.
# NOTE(review): `importance` is not defined above this point in the visible
# history, so this call relies on an object created earlier in the session.
plot(importance)
# Prepare the resampling scheme: 5-fold cross-validation.
control <- trainControl(method = "cv", number = 5)
# Train the model on the training split only. Keeping the test split out of
# feature selection avoids biasing the later performance estimate.
set.seed(1)
model <- train(churn ~ ., data = df_train, method = "xgbTree",
               trControl = control, importance = TRUE)
# Unscaled variable importance of the fitted model.
importance <- varImp(model, scale = FALSE)
# Print the object computed on the line above (the history printed
# `importances`, a different name from the one just assigned).
print(importance)
##### function.1
# Draw several plots on one page, arranged in a grid.
#
# Arguments:
#   ...       plot objects to draw (anything with a print method accepting `vp`).
#   plotlist  optional list of further plot objects, appended after `...`.
#   file      unused; kept so existing calls that pass it keep working.
#   cols      number of columns in the grid when `layout` is NULL.
#   layout    optional integer matrix; the cell(s) holding value i receive
#             plot i. When given, `cols` is ignored.
#
# Returns (invisibly) NULL when no plots are supplied or when several plots
# are drawn; with exactly one plot, the value of print() on that plot.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # Fail loudly if grid is unavailable instead of require()'s silent FALSE.
  if (!requireNamespace("grid", quietly = TRUE)) {
    stop("Package 'grid' is required by multiplot().", call. = FALSE)
  }

  # Make a single list from the ... arguments and plotlist.
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw. Without this guard the loop below would run over
  # c(1, 0) and fail on plots[[1]].
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, derive it from `cols`: enough rows to hold all plots.
  if (is.null(layout)) {
    n_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq(1, cols * n_rows), ncol = cols, nrow = n_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page with one viewport cell per layout entry.
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )
    # Draw each plot in the cell(s) whose layout value equals its index.
    for (i in seq_len(num_plots)) {
      cell <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = grid::viewport(layout.pos.row = cell$row,
                                            layout.pos.col = cell$col))
    }
  }
}
##### function 2.
# Draw the distributions of the numeric features, filled by churn label,
# stacked in a single column on one page.
#
# One of the key steps in a classification problem is to look at each
# feature's distribution per class: the further apart the class
# distributions are, the easier the label is to predict. Categorical features
# are left out here; only the numeric ones are drawn.
#
# Arguments:
#   data  data frame with columns churn, avg_dist, avg_rating_by_driver,
#         avg_rating_of_driver, avg_surge, surge_pct, trips_in_first_30_days
#         and weekday_pct. Defaults to the global `df`, which is what the
#         original zero-argument version read.
#
# Returns NULL invisibly; the function is called for its drawing side effect.
#
# The earlier version also built plots p1, p8, p10, p11 and three unassigned
# ggplot expressions. None of them were ever drawn (ggplot objects do not
# auto-print inside a function and they were not passed to multiplot), and
# p10 referenced `city_value` where the rest of the script uses `city`.
# That dead code, and the local df mutations that only fed it, is removed.
see_distribution <- function(data = df) {
  library(ggplot2)

  # Shared by every panel: the fill legend is hidden.
  no_legend <- theme(legend.position = "none")

  p2 <- ggplot(data, aes(avg_dist, fill = churn)) +
    geom_density() +
    no_legend
  p3 <- ggplot(data, aes(avg_rating_by_driver, fill = churn)) +
    geom_bar() +
    no_legend +
    scale_y_continuous(limits = c(0, 200000))
  p4 <- ggplot(data, aes(avg_rating_of_driver, fill = churn)) +
    geom_bar() +
    no_legend +
    scale_y_continuous(limits = c(0, 200000))
  p5 <- ggplot(data, aes(avg_surge, fill = churn)) +
    geom_density() +
    no_legend
  p6 <- ggplot(data, aes(surge_pct, fill = churn)) +
    geom_density() +
    no_legend
  # y is the count rescaled by the total count.
  p7 <- ggplot(data, aes(trips_in_first_30_days, fill = churn)) +
    geom_density(aes(y = (..count..) / sum(..count..))) +
    no_legend
  p9 <- ggplot(data, aes(weekday_pct, fill = churn)) +
    geom_density() +
    no_legend

  invisible(multiplot(p2, p3, p4, p5, p6, p7, p9, cols = 1))
}
#####
# Read the churn data set.
df <- read.csv(file = "churn.csv", header = TRUE, sep = ",")
# Collapse the city_* and phone_* indicator columns into one label column
# each, and recode churn from 1 / other to "churned" / "not_churned".
df <- df %>%
  mutate(
    city = case_when(
      (city_Astapor == 1) ~ "Astapor",
      (city_King.s.Landing == 1) ~ "King",
      (city_Winterfell == 1) ~ "Winterfell"
    ),
    phone = case_when(
      (phone_Android == 1) ~ "Android",
      (phone_iPhone == 1) ~ "iPhone",
      (phone_no_phone == 1) ~ "Other"
    ),
    churn = ifelse(churn == 1, "churned", "not_churned")
  )
## Summarise the data set to get a sense of the distributions.
summary(df)
## Check the column types.
str(df)
## Count missing values over the whole data frame.
sum(is.na(df))
# Bar chart of the churn label counts (legend hidden). The history assigns
# this same plot twice in a row; one assignment gives the same result.
p1 <- ggplot(df, aes(churn, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
# Packages used by this analysis. The original install call listed "ggplot",
# but the package attached below is "ggplot2".
required_pkgs <- c("ggplot2", "dplyr", "tidyr", "tidyverse", "plotly",
                   "caret", "corrplot", "pROC", "mlbench")
# Install only what is missing, so re-running does not reinstall everything.
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# Attach each package once (the history repeated this block verbatim).
library(ggplot2)
library(dplyr)
library(tidyr)
library(tidyverse)
library(plotly)
library(caret)
library(corrplot)
library(pROC)
library(mlbench)
options(scipen = 999, stringsAsFactors = FALSE)   # Avoid automatic scientific notation of numbers
# Bar chart of the churn label counts (legend hidden).
p1 <- df %>%
  ggplot(aes(churn, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
p1
# Read the churn data set.
df <- read.csv(file = "churn.csv", header = TRUE, sep = ",")
# Collapse the city_* and phone_* indicator columns into one label column
# each, and recode churn from 1 / other to "churned" / "not_churned".
df <- df %>%
  mutate(
    city = case_when(
      (city_Astapor == 1) ~ "Astapor",
      (city_King.s.Landing == 1) ~ "King",
      (city_Winterfell == 1) ~ "Winterfell"
    ),
    phone = case_when(
      (phone_Android == 1) ~ "Android",
      (phone_iPhone == 1) ~ "iPhone",
      (phone_no_phone == 1) ~ "Other"
    ),
    churn = ifelse(churn == 1, "churned", "not_churned")
  )
# Drop the indicator columns now that city and phone hold the labels.
df <- subset(df, select = -c(city_Astapor:phone_no_phone))
head(df)
# Bar chart of the churn label counts (legend hidden).
p1 <- df %>%
  ggplot(aes(churn, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
p1
summary(df$churn)
# churn now holds "churned" / "not_churned", so comparing it with 1 or with
# "churn" (as the history did) is FALSE for every row. Count the labels.
table(df$churn)
View(df)
# Read the churn data set.
df <- read.csv(file = "churn.csv", header = TRUE, sep = ",")
# Collapse the city_* and phone_* indicator columns into one label column
# each, and recode churn from 1 / other to "churned" / "not_churned".
df <- df %>%
  mutate(
    city = case_when(
      (city_Astapor == 1) ~ "Astapor",
      (city_King.s.Landing == 1) ~ "King",
      (city_Winterfell == 1) ~ "Winterfell"
    ),
    phone = case_when(
      (phone_Android == 1) ~ "Android",
      (phone_iPhone == 1) ~ "iPhone",
      (phone_no_phone == 1) ~ "Other"
    ),
    churn = ifelse(churn == 1, "churned", "not_churned")
  )
# Drop the indicator columns now that city and phone hold the labels.
df <- subset(df, select = -c(city_Astapor:phone_no_phone))
head(df)
## Summarise the data set to get a sense of the distributions.
summary(df)
## Check the column types.
str(df)
## Count missing values over the whole data frame.
sum(is.na(df))
# Bar chart of the churn label counts (legend hidden).
p1 <- ggplot(df, aes(churn, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
p1
# avg_dist counts, filled by churn label.
p2 <- ggplot(df, aes(avg_dist, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
p2
# avg_rating_by_driver counts, one panel per churn label, y limited to 0-30000.
p3 <- ggplot(df, aes(avg_rating_by_driver, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 30000)) +
  facet_grid(~churn)
p3
# avg_rating_of_driver counts, same layout as p3.
p4 <- ggplot(df, aes(avg_rating_of_driver, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 30000)) +
  facet_grid(~churn)
p4
# Both rating plots side by side.
multiplot(p3, p4, cols = 2)
# The history rebuilds the identical p4 and draws both again; p4 is
# unchanged, so only the two drawing calls are repeated here.
p4
multiplot(p3, p4, cols = 2)
# avg_surge counts, one panel per churn label.
p5 <- ggplot(df, aes(avg_surge, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none") +
  facet_grid(~churn)
p5
# Flag rides: avg_surge equal to 1 is "not_surged", anything else "surged".
df <- mutate(df, surge_or_not = ifelse(avg_surge == 1, "not_surged", "surged"))
ggplot(df, aes(surge_or_not, fill = surge_or_not)) +
  geom_bar() +
  theme(legend.position = "none") +
  facet_grid(~churn)
# surge_pct counts, one panel per churn label.
p6 <- ggplot(df, aes(surge_pct, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none") +
  facet_grid(~churn)
p6
# From here on the bar heights are counts divided by the total count.
p7 <- ggplot(df, aes(trips_in_first_30_days, fill = churn)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme(legend.position = "none") +
  facet_grid(~churn)
p7
# Flag users with zero trips in the first 30 days as "not_used".
df <- mutate(
  df,
  used_or_not_30days = ifelse(trips_in_first_30_days == 0, "not_used", "used")
)
ggplot(df, aes(used_or_not_30days, fill = used_or_not_30days)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme(legend.position = "none") +
  facet_grid(~churn)
p8 <- ggplot(df, aes(luxury_car_user, fill = as.factor(luxury_car_user))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme(legend.position = "none") +
  facet_grid(~churn)
p8
p9 <- ggplot(df, aes(weekday_pct, fill = churn)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme(legend.position = "none") +
  facet_grid(~churn)
p9
# Bucket weekday_pct: exactly 0, strictly between 0 and 100, exactly 100.
df <- mutate(
  df,
  weekday_bucket = case_when(
    weekday_pct == 0 ~ "zero",
    (weekday_pct > 0) & (weekday_pct < 100) ~ "soso",
    weekday_pct == 100 ~ "everyday"
  )
)
ggplot(df, aes(x = factor(weekday_bucket), fill = churn)) +
  geom_bar(aes(y = (..count..) / sum(..count..)))
p10 <- ggplot(df, aes(city, fill = churn)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme(legend.position = "none") +
  facet_grid(~churn)
p10
p11 <- ggplot(df, aes(phone, fill = churn)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme(legend.position = "none") +
  facet_grid(~churn)
p11
# Draw the stacked numeric-feature distributions.
see_distribution()
# Numeric feature columns to include in the correlation plot.
num <- c(
  "avg_dist",
  "avg_rating_by_driver",
  "avg_rating_of_driver",
  "avg_surge",
  "surge_pct",
  "trips_in_first_30_days",
  "weekday_pct"
)
df_num <- df %>% select(one_of(num))
# Lower-triangle correlation plot, computed on complete observations only.
corrplot(
  cor(df_num, use = "complete.obs"),
  type = "lower"
)
# library() stops with an error if caret is missing; require() would only
# return FALSE and let the script fail later.
library(caret)
## Formatting some features
names(df)
# [1] "avg_dist"               "avg_rating_by_driver"   "avg_rating_of_driver"
# [4] "avg_surge"              "surge_pct"              "trips_in_first_30_days"
# [7] "luxury_car_user"        "weekday_pct"            "churn"
# [10] "city"                   "phone"                  "surge_or_not"
# [13] "used_or_not_30days"      "weekday_bucket"
# Categorical columns become factors.
cols <- c("luxury_car_user", "city", "phone", "surge_or_not",
          "used_or_not_30days", "weekday_bucket", "churn")
df[cols] <- lapply(df[cols], factor)
# Measurement columns become numeric.
cols_num <- c("avg_dist", "avg_rating_by_driver", "avg_rating_of_driver",
              "avg_surge", "surge_pct", "trips_in_first_30_days",
              "weekday_pct")
df[cols_num] <- lapply(df[cols_num], as.numeric)
# Move churn to the last column, otherwise it is hard to see.
# An exact name match is used instead of grep(), which would also pick up any
# other column whose name merely contains "churn". setdiff() keeps all
# columns when nothing matches, whereas indexing with -integer(0) would drop
# every column.
col_idx <- which(names(df) == "churn")
df <- df[, c(setdiff(seq_len(ncol(df)), col_idx), col_idx)]
# The history ran this whole formatting block twice back to back; the steps
# are idempotent, so it appears once here.
# Seed the random number generator before sampling so that the train/test
# split, and every metric computed from it, can be reproduced. The history
# only set a seed after the split had been drawn.
set.seed(1)
# Partition on df$churn with p = 0.8; indices are returned as a matrix
# (list = FALSE) for a single partition (times = 1).
trainIndex <- createDataPartition(df$churn, p = 0.8,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
# Rows selected by the partition form the training set; the rest are test.
df_train <- df[trainIndex, ]
df_test <- df[-trainIndex, ]
# Resampling setup: 5-fold cross-validation with class probabilities and the
# two-class summary, so models can be selected on the "ROC" metric.
control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
seed <- 1
metric <- "ROC"
set.seed(seed)
# mtry <- sqrt(ncol(df))
# tunegrid <- expand.grid(.mtry=mtry)
# gbmGrid <-  expand.grid(interaction.depth = c(1, 5, 9),
#                         n.trees = (1:30)*50,
#                         shrinkage = 0.1,
#                         n.minobsinnode = 20)
# Fit a GBM on all predictors; no tuning grid is passed to train().
gbm_default <- train(
  churn ~ .,
  data = df_train,
  method = "gbm",
  metric = metric,
  trControl = control
)
print(gbm_default)
# Stochastic Gradient Boosting
#
# 40001 samples
#    13 predictor
#     2 classes: 'churned', 'not_churned'
#
# No pre-processing
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 32001, 32000, 32001, 32000, 32002
# Resampling results across tuning parameters:
#
#   interaction.depth  n.trees  ROC        Sens       Spec
#   1                   50      0.8226772  0.8872459  0.5602904
#   1                  100      0.8342703  0.8770284  0.5993089
#   1                  150      0.8391937  0.8704170  0.6213777
#   2                   50      0.8385264  0.8703769  0.6169236
#   2                  100      0.8459340  0.8583562  0.6504257
#   2                  150      0.8487931  0.8589974  0.6576714
#   3                   50      0.8438442  0.8636852  0.6360681
#   3                  100      0.8488437  0.8597987  0.6571393
#   3                  150      0.8510637  0.8616018  0.6601305
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
# Tuning parameter
# 'n.minobsinnode' was held constant at a value of 10
# ROC was used to select the optimal model using  the largest value.
# The final values used for the model were n.trees = 150, interaction.depth = 3, shrinkage = 0.1
# and n.minobsinnode = 10.
# Plot the fitted caret model object.
plot(gbm_default)
# Class predictions on the held-out test set and their confusion matrix.
set.seed(1)
pt_gbm <- predict(gbm_default, newdata = df_test)
confusionMatrix(pt_gbm, df_test$churn)
# Class probabilities on the test set, used for the ROC curve.
set.seed(1)
gbm_probs <- predict(gbm_default, newdata = df_test, type = "prob")
# The factor levels are passed to roc() in reverse order.
gbm_ROC <- roc(
  predictor = gbm_probs$churned,
  response = df_test$churn,
  levels = rev(levels(df_test$churn))
)
gbm_ROC$auc
plot(gbm_ROC, main = "GBM ROC")
# Histogram of the predicted "churned" probability, one panel per true label.
hist1 <- histogram(
  ~ gbm_probs$churned | df_test$churn,
  xlab = "Probability of Churn, GBM",
  ylim = c(0, 30)
)
hist1
# Same resampling setup as for the GBM: 5-fold CV, class probabilities,
# two-class summary, selection on "ROC".
control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
seed <- 1
metric <- "ROC"
set.seed(seed)
# mtry <- sqrt(ncol(df))
# tunegrid <- expand.grid(.mtry=mtry)
# Fit xgbTree on all predictors; no tuning grid is passed to train().
xgb_default <- train(
  churn ~ .,
  data = df_train,
  method = "xgbTree",
  metric = metric,
  trControl = control
)
print(xgb_default)
plot(xgb_default)
# Class predictions on the held-out test set and their confusion matrix.
pt_xgb <- predict(xgb_default, newdata = df_test)
confusionMatrix(pt_xgb, df_test$churn)
# Class probabilities on the test set, used for the ROC curve.
set.seed(1)
xgb_probs <- predict(xgb_default, newdata = df_test, type = "prob")
xgb_ROC <- roc(
  predictor = xgb_probs$churned,
  response = df_test$churn,
  levels = rev(levels(df_test$churn))
)
xgb_ROC$auc
plot(xgb_ROC, main = "XGBBOOST ROC")
# Histogram of the predicted "churned" probability, one panel per true label.
hist2 <- histogram(
  ~ xgb_probs$churned | df_test$churn,
  xlab = "Probability of Churn, XGBOOST",
  ylim = c(0, 30)
)
hist2
# Compare the resampling results of the two fitted models.
set.seed(1)
resamps <- resamples(
  list(
    GBM = gbm_default,
    XGBBOOST = xgb_default
  )
)
summary(resamps)
trellis.par.set(caretTheme())
dotplot(resamps, metric = "ROC")
# Resampling scheme for the importance model: plain 5-fold cross-validation.
control <- trainControl(
  method = "cv",
  number = 5
)
# Fit on the training split only. Using the whole data set, test rows
# included, for feature selection would bias the performance estimate.
set.seed(1)
model <- train(
  churn ~ .,
  data = df_train,
  method = "xgbTree",
  trControl = control,
  importance = TRUE
)
# Unscaled variable importance of the fitted model.
importances <- varImp(model, scale = FALSE)
print(importances)
# xgbTree variable importance
#
# Overall
# avg_rating_by_driver   0.240435
# surge_pct              0.158038
# cityKing               0.156737
# weekday_pct            0.156282
# phoneiPhone            0.081768
# trips_in_first_30_days 0.067793
# luxury_car_user1       0.066480
# avg_dist               0.026917
# cityWinterfell         0.018469
# avg_rating_of_driver   0.014333
# avg_surge              0.011648
# phoneOther             0.001099
# surge_or_notsurged     0.000000
# used_or_not_30daysused 0.000000
# weekday_bucketzero     0.000000
# weekday_bucketsoso     0.000000
# Show the variable importances as a table and as a plot.
print(importances)
plot(importances)
# NOTE(review): `results` is not created anywhere in the visible history;
# presumably it comes from an earlier step of the session. Confirm.
plot(results, type = c("g", "o"))
# Resampling setup: 5-fold CV, class probabilities, two-class summary,
# selection on "ROC".
control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
seed <- 1
metric <- "ROC"
set.seed(seed)
# Refit xgbTree on a hand-picked subset of the predictors.
xgb_select <- train(
  churn ~ city + phone + avg_rating_by_driver + weekday_bucket +
    luxury_car_user + surge_pct,
  data = df_train,
  method = "xgbTree",
  metric = metric,
  trControl = control
)
print(xgb_select)
# ROC is about 0.81
# Class predictions on the held-out test set and their confusion matrix.
pt_select <- predict(xgb_select, newdata = df_test)
confusionMatrix(pt_select, df_test$churn)
# Class probabilities on the test set; these overwrite the earlier xgb_probs
# and xgb_ROC objects.
xgb_probs <- predict(xgb_select, newdata = df_test, type = "prob")
xgb_ROC <- roc(
  predictor = xgb_probs$churned,
  response = df_test$churn,
  levels = rev(levels(df_test$churn))
)
xgb_ROC$auc
# Histogram of the predicted "churned" probability, one panel per true label.
hist3 <- histogram(
  ~ xgb_probs$churned | df_test$churn,
  xlab = "Probability of Churn by selected feature, XGBOOST",
  ylim = c(0, 35)
)
hist3
# Attach the packages before anything uses them. The history built p1 with
# %>% and ggplot() first and attached the libraries afterwards, which fails
# in a fresh session, and then repeated the attach block a second time.
library(ggplot2)
library(dplyr)
library(tidyr)
library(tidyverse)
library(plotly)
library(caret)
library(corrplot)
library(pROC)
library(mlbench)
options(scipen = 999, stringsAsFactors = FALSE)   # Avoid automatic scientific notation of numbers
# Bar chart of the churn label counts (legend hidden).
p1 <- df %>%
  ggplot(aes(churn, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
p1
# avg_dist counts, filled by churn label (legend hidden).
p2 <- df %>%
  ggplot(aes(avg_dist, fill = churn)) +
  geom_bar() +
  theme(legend.position = "none")
p2
# Render churn.Rmd.
#
# The history reached this by trial and error: render() was called before
# rmarkdown was attached, library() was called with no package,
# installed.packages("tidyverse") passed the package name where a library
# path is expected, markdown::render() was tried in place of
# rmarkdown::render(), and the same packages were installed repeatedly.
# The working sequence is kept below.
report_pkgs <- c("tidyverse", "rmarkdown", "highr")
# Install only what is missing, so re-running does not reinstall everything.
missing_pkgs <- setdiff(report_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library("rmarkdown")
# Namespace-qualified so the call does not depend on what is attached.
rmarkdown::render("churn.Rmd")