-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathlogit_titanic.R
436 lines (357 loc) · 16.4 KB
/
logit_titanic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
library(stats)
library(ggplot2)
library(dplyr)
library(stringr)
library("MASS")
library("car")
library(caTools)
library(e1071)
library(caret)
library(readr)
library(kernlab)
#************************************ Rough Code Draft***************************************#
##Note: Performance Linear Regression to predict Age=0.75119
##Note: Performance Missing Age replaced with median/mean=0.75598
##Trying SVM model in additional script file to check for model improvement.
survival_db<- read.csv("train.csv", stringsAsFactors = FALSE, na.strings = c("", " ", "na", "NA"))
test_db<- read.csv("test.csv", stringsAsFactors = FALSE, na.strings = c("", " ", "na", "NA"))
sapply(survival_db, function(x) sum(is.na(x)))
#Missing Values in input DB
#Age= 177
#Cabin= 687[Extract Deck]
#Embarked= 2*
sapply(test_db, function(x) sum(is.na(x)))
#Cabin=327[deleted]
#Fare=1
#Age=86
#Lets Round off Age to one digit after the decimal space
survival_db$Age<- round(survival_db$Age, 1)
survival_db$Embarked[which(is.na(survival_db$Embarked))]<- "S"
#62 and 830 have missing values for Embarked Filled with "S"
dump1<-filter(survival_db, Pclass==1 & Fare>=80 & SibSp==0 & Parch==0 & Fare<=90 )
ggplot(survival_db, aes(x=as.factor(survival_db$Embarked)))+geom_bar()
boxplot(survival_db$Fare~survival_db$Embarked)
#Extracting Ticket Number from the Ticket column
dump2<-data.frame(str_split(survival_db$Ticket, pattern = " ", n=2), stringsAsFactors = FALSE)
dump2<- data.frame(t(dump2))
rownames(dump2)<- 1:nrow(survival_db)
colnames(dump2)[2]<- "Ticket_Number"
survival_db<-cbind(survival_db, dump2$Ticket_Number)
rownames(survival_db)<- 1:nrow(survival_db)
colnames(survival_db)[13]<- "Ticket_Number"
survival_db$Ticket_Number<- as.character(survival_db$Ticket_Number)
survival_db$Ticket_Number<-str_replace(survival_db$Ticket_Number, pattern = "2. ", replacement = "")
survival_db$Ticket_Number[474]<-"541"
survival_db$Ticket_Number<- str_replace(survival_db$Ticket_Number, pattern = "LINE", replacement = "1466287")
survival_db$Ticket_Number<- as.numeric(as.character(survival_db$Ticket_Number))
#Rounding off Fare to one digit
survival_db$Fare<- round(survival_db$Fare, 1)
#Creating the Deck Column
survival_db$Deck<- str_sub(survival_db$Cabin, start = 1L, end = 1L)
survival_db$Deck[which(is.na(survival_db$Deck)==T)]<- "Unknown"
#Extracting Name Title
dump1<- data.frame(str_split(survival_db$Name, pattern = ", ", n=2), stringsAsFactors = FALSE)
dump1<-data.frame(t(dump1))
dump2<- data.frame(str_split(dump1$X2, pattern = " ", n=2), stringsAsFactors = FALSE)
dump2<-data.frame(t(dump2))
rownames(dump2)<- 1:nrow(dump2)
dump2$X1<- as.character(dump2$X1)
dump2$X1[which(dump2$X1=="the")]<- "Countess."
survival_db<- cbind(survival_db, dump2$X1)
colnames(survival_db)[15]<- "Title"
survival_db$Title<- as.character(survival_db$Title)
#Removing un-necessary columns from the dataset
survival_db<- survival_db[,-c(4,9,11)]
rm(dump1)
rm(dump2)
#Computing the total number of family members
survival_db$Family<- survival_db$SibSp + survival_db$Parch
#Regularizing the Title
unique(survival_db$Title)
men<- c("Mr.", "Don.", "Rev.","Major.","Sir.","Col.", "Capt.", "Jonkheer.")
girls<- c("Ms." , "Mlle.")
women<- c("Mrs.", "Mme.","Lady.", "Countess.", "Dona.")
nam_title_reg<- function(inp_str, gender)
{
if(inp_str %in% men)
return("Mr.")
else if(inp_str %in% women)
return("Mrs.")
else if(inp_str %in% girls)
return("Miss.")
else if(inp_str=="Dr." & gender=="male")
return("Mr.")
else if(inp_str=="Dr." & gender=="female")
return("Mrs.")
else return(inp_str)
}
for(i in 1:nrow(survival_db))
{
survival_db$Title[i]<- nam_title_reg(survival_db$Title[i], survival_db$Sex[i])
}
#Creating a Ratio Metric
survival_db$Ratio<- survival_db$Fare/(survival_db$Family+1)
survival_db$Ratio<- round(survival_db$Ratio, 1)
#Building a LM model to derive Missing Age values
#Trying Correlation plots
dabba<- survival_db[,c(1,3,5,6,7,8,10,13,14)]
dabba<- na.omit(dabba)
cor(dabba)
#Trying an LinReg to predict age.
set.seed(85)
jix<- sample(1:nrow(dabba), 0.8*nrow(dabba))
zax_trng<- dabba[jix,]
zwx_test<- dabba[-jix,]
names(zax_trng)
mod_1<-lm(Age~., data=zax_trng)
summary(mod_1)
mod_2<- stepAIC(mod_1, direction = "both")
summary(mod_2)
vif(mod_2)
#Arrived at the final model Adjusted R-squared: 0.2144
pred_age<- predict(mod_2, zwx_test)
zwx_test<- cbind(zwx_test, pred_age)
zwx_test$pred_age<- round(zwx_test$pred_age, 1)
cor(zwx_test$Age, zwx_test$pred_age)
#0.507 so that's a decent model
#lm(formula = Age ~ Pclass + SibSp + Parch + Ratio, data = zax_trng)
Estimate_age<- predict(mod_2, survival_db)
survival_db<-cbind(survival_db, Estimate_age)
survival_db$Estimate_age[which(survival_db$Estimate_age<=0)]<- median(survival_db$Age, na.rm = TRUE)
survival_db$Estimate_age<- round(survival_db$Estimate_age, 1)
#Imputing Missing Age values
for(i in 1:nrow(survival_db))
{
if(is.na(survival_db$Age[i])==TRUE)
{
survival_db$Age[i]<- survival_db$Estimate_age[i]
}
}
survival_db<- survival_db[,-15]
#Defining a Measure Age*Pclass
survival_db$Measure<- survival_db$Age*survival_db$Pclass
rm(dabba)
rm(zax_trng)
rm(zwx_test)
#************************************ Understanding the Data ***************************************#
#Structure of the dataset
str(survival_db)
#Checking the number of distinct variables in the dataset
length(unique(survival_db$PassengerId))
unique(survival_db$Survived)
#Converting Survived to a factor variable
survival_db$Survived<- factor(survival_db$Survived)
unique(survival_db$Pclass)
unique(survival_db$Sex)
length(unique(survival_db$SibSp))
length(unique(survival_db$Parch))
unique(survival_db$Embarked)
length(unique(survival_db$Deck))
unique(survival_db$Title)
length(unique(survival_db$Family))
names(survival_db)
#Transfering all factor variables to one table and creating dummies
cat_tab<- survival_db[,c(3,4,9,11,12)]
cat_tab<- data.frame(sapply(cat_tab, factor))
str(cat_tab)
cat_tab<- data.frame(model.matrix(~., data = cat_tab))
cat_tab<- cat_tab[,-1]
#Deleting Categorical Columns
survival_db<- survival_db[,-c(3,4,9,11,12)]
#Scaling the numeric variables
num_tab<- survival_db[,-2]
num_tab<- data.frame(sapply(num_tab, scale))
survival_db<- data.frame(survival_db[,-c(1,3:10)])
#Combining the dataframes
survival_db<- cbind(survival_db, num_tab)
survival_db<- cbind(survival_db, cat_tab)
colnames(survival_db)[1]<- "Survived"
#Splitting the survival_db into test and training dataset
set.seed(85)
index<- sample.split(survival_db$Survived, SplitRatio = 0.8)
trng_data<- survival_db[index,]
test_data<- survival_db[!index,]
names(trng_data)
#************************************ Logit Model Building ***************************************#
logit_mod_1<- glm(Survived~., trng_data, family = "binomial")
summary(logit_mod_1)
logit_mod_2<- stepAIC(logit_mod_1, direction = "both")
summary(logit_mod_2)
vif(logit_mod_2)
#We will now begin elimination based on p-value and vif
#Since all vif values are within range and less than 3 we will eliminate based on p-value
#We will remove TitleMrs. from the next model as p-value=0.1115
logit_mod_3<- glm(formula = Survived ~ Age + SibSp + Parch + Ticket_Number +
Pclass2 + Pclass3 + EmbarkedS + DeckB + DeckD + DeckE + TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_3)
vif(logit_mod_3)
#We will remove Ticket Number from the next model as p-value=0.121622
logit_mod_4<- glm(formula = Survived ~ Age + SibSp + Parch + Pclass2 + Pclass3 + EmbarkedS + DeckB
+ DeckD + DeckE + TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_4)
vif(logit_mod_4)
#We will remove EmbarkedS from the next model as p-value=0.120394
logit_mod_5<- glm(formula = Survived ~ Age + SibSp + Parch + Pclass2 + Pclass3 + DeckB
+ DeckD + DeckE + TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_5)
vif(logit_mod_5)
#We will remove Parch from the next model as p-value=0.052237
logit_mod_6<- glm(formula = Survived ~ Age + SibSp + Pclass2 + Pclass3 + DeckB
+ DeckD + DeckE + TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_6)
vif(logit_mod_6)
#logit_mod_6 is a contender for the final model let us remove a few more varibles followed by cross validation for all models
#We will remove DeckB from the next model as p-value=0.033777
logit_mod_7<- glm(formula = Survived ~ Age + SibSp + Pclass2 + Pclass3 +
DeckD + DeckE + TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_7)
vif(logit_mod_7)
#We will remove DeckD from the next model as p-value=0.055407
logit_mod_8<- glm(formula = Survived ~ Age + SibSp + Pclass2 + Pclass3 +
DeckE + TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_8)
vif(logit_mod_8)
#We will remove DeckE from the next model as p-value=0.059870
logit_mod_9<- glm(formula = Survived ~ Age + SibSp + Pclass2 + Pclass3 +
TitleMr., family = "binomial", data = trng_data)
summary(logit_mod_9)
vif(logit_mod_9)
#logit_mod_9 is the optimal reduced model with 5 predictor variables all of which are highly significant for predicting the Survival of the passengers
#************************************ Patrameter Optimization ***************************************#
#Assigning the reduced logit model to final_model for parameter optimization
final_model<- logit_mod_9
#Generating predicted survival probability for the segmented test data
survival_prob<- predict(final_model, type = "response", test_data[,-1])
survival_prob<- as.numeric(survival_prob)
str(survival_prob)
summary(survival_prob)
#Storing the Actual Survived labels of segmented test data into new vector- test_actual_survival
test_actual_survival<- factor(ifelse(test_data$Survived==1, "Yes","No"))
test_actual_survival
#Now we will define a custom parameter optimization function to find the optimal cutoff_probability
parameter_optimization <- function(cutoff_prob)
{
predicted_survival <- factor(ifelse(survival_prob >= cutoff_prob, "Yes", "No"))
conf_mat <- confusionMatrix(predicted_survival, test_actual_survival, positive = "Yes")
accuracy_val <- conf_mat$overall[1]
sensitivity_val <- conf_mat$byClass[1]
specificity_val <- conf_mat$byClass[2]
output <- t(as.matrix(c(sensitivity_val, specificity_val, accuracy_val)))
colnames(output) <- c("Sensitivity", "Specificity", "Accuracy")
return(output)
}
#We will create a sequence of 100 cutoff values between 0.01 and 0.80 to be sent as an input to the parameter_optimization function.
cutoff_vector = seq(.01,.80,length=100)
#We have created eval_matrix to store the sensitivity, specificity and accuracy values of each of the 200 input cutoff_vector values from the parameter_optimization function.
eval_mat = matrix(0,100,3)
#Evaluating the sensitivity, specificity and accuracy parameters for each test case.
for(sb in 1:100)
{
eval_mat[sb,] = parameter_optimization(cutoff_vector[sb])
}
#Plotting the parameter_values for each cutoff value to determine the optimal cutoff value.
plot(cutoff_vector, eval_mat[,1],xlab="Cutoff_Prob",ylab="Parameter_Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(cutoff_vector,eval_mat[,2],col="darkgreen",lwd=2)
lines(cutoff_vector,eval_mat[,3],col=4,lwd=2)
title(main = "Plot3. Optimization Parameters")
box()
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
#The optimal cutoff value will be the cutoff value for which the difference between Sensitivity and Specificity is minimum.
opt_cutoff_mat <- cutoff_vector[which(abs(eval_mat[,1]-eval_mat[,2])==min(abs(eval_mat[,1]-eval_mat[,2])))]
opt_cutoff<- opt_cutoff_mat[1]
#Computing the Attrition status of the predicted test_predict vector by using the optimal cutoff value[opt_cutoff]
predicted_survival <- factor(ifelse(survival_prob >= opt_cutoff, "Yes", "No"))
conf_final <- confusionMatrix(predicted_survival, test_actual_survival, positive = "Yes")
opt_accuracy <- conf_final$overall[1]
opt_sensitivity <- conf_final$byClass[1]
opt_specificity <- conf_final$byClass[2]
#Storing the optimum parameters in a dataframe optimum_parameters
optimum_parameters<-data.frame(opt_accuracy, opt_sensitivity, opt_specificity, opt_cutoff, row.names = "[final_model]")
optimum_parameters
#Final Model Performance on test_subset. We will now prepare the test_db data and submit our prediction
# opt_accuracy opt_sensitivity opt_specificity opt_cutoff Model
#[final_model] 0.7808989 0.7794118 0.7818182 0.4409091 logit_mod_9
#************************************ Preparing Test Database ***************************************#
sapply(test_db, function(x) sum(is.na(x)))
#Cabin=327[deleted]
#Fare=1
#Age=86
#Rounding off Age to one decimal space.
test_db$Age<- round(test_db$Age, 1)
#There is one missing value for Fare in our Dataset. We will impute the missing value with the median
#This will not impact the performance as Fare is not a significant parameter to our logit model. But it is required to impute the records with missing age
summary(test_db$Fare)
which(is.na(test_db$Fare)==TRUE)
test_db[153,]
dump1<- filter(test_db, test_db$Pclass==3, test_db$Embarked=="S")
summary(dump1$Fare)
#Clearly the data is skewed. We will impute the missing fare with the median value
test_db$Fare[which(is.na(test_db$Fare)==TRUE)]<- median(test_db$Fare, na.rm = TRUE)
test_db$Fare<- round(test_db$Fare,1)
#Removing Irrelevant columns from the Test Data
test_db<- test_db[,-c(8,10,11)]
#Creating the Family Column
test_db$Family<- test_db$SibSp + test_db$Parch
#Creating the Ratio Column
test_db$Ratio<- test_db$Fare/(test_db$Family+1)
#Using the Linear Regression Model to Predict the age of the records with the missing Age
Estimate_age<- predict(mod_2, test_db)
test_db<- cbind(test_db, Estimate_age)
test_db$Estimate_age[which((test_db$Estimate_age<=0)==TRUE)]<- median(test_db$Age, na.rm = TRUE)
test_db$Estimate_age<- round(test_db$Estimate_age, 1)
#Imputing missing Age with the Estimate Age
for(i in 1:nrow(test_db))
{
if(is.na(test_db$Age[i])==TRUE)
{
test_db$Age[i]<- test_db$Estimate_age[i]
}
}
#Now we will perform the Name Title Based Extraction
dump1<- data.frame(str_split(test_db$Name, pattern = ", ", n=2), stringsAsFactors = FALSE)
dump1<-data.frame(t(dump1))
dump2<- data.frame(str_split(dump1$X2, pattern = " ", n=2), stringsAsFactors = FALSE)
dump2<-data.frame(t(dump2))
rownames(dump2)<- 1:nrow(dump2)
dump2$X1<- as.character(dump2$X1)
unique(dump2$X1)
which(dump2$X1=="Dona.")
dump2[415,]
dump1[415,]
test_db[415,]
#From Here we know that "Dona." is a potuguese title given to women. It is equivalent to Mrs.
#There we have added this title in the women vector
test_db<- cbind(test_db, dump2$X1)
colnames(test_db)[12]<- "Title"
test_db$Title<- as.character(test_db$Title)
#Regularizing the Name Titles
for(i in 1:nrow(test_db))
{
test_db$Title[i]<- nam_title_reg(test_db$Title[i], test_db$Sex[i])
}
unique(test_db$Title)
#We will now remove all columns that are irrelevant to the final logit model
test_db<- test_db[,-c(3,4,7,8,9,10,11)]
#Creating dummy variables for the categorical variables
cat_tab<- test_db[,c(2,5)]
cat_tab<- data.frame(sapply(cat_tab, factor))
cat_tab<- data.frame(model.matrix(~., cat_tab))
cat_tab<- cat_tab[,-1]
num_tab<- test_db[,c(3,4)]
num_tab<- data.frame(sapply(num_tab, scale))
test_db<- cbind(test_db$PassengerId,num_tab, cat_tab)
colnames(test_db)[1]<- "PassengerId"
#Dataset is prepared. We will now Predict the survival probability using the logit model optimized with the training model.
survival_prob<- predict(final_model, type = "response", test_db[,-1])
survival_prob<- as.numeric(survival_prob)
str(survival_prob)
summary(survival_prob)
#Now we will extract the predicted labels
predicted_survival <- factor(ifelse(survival_prob >= opt_cutoff, 1, 0))
test_db<-cbind(test_db, predicted_survival)
colnames(test_db)[9]<-"Survived"
#Now we will create a submission file
amw_logit_titanic<- test_db[,c(1,9)]
write.csv(amw_logit_titanic, "amw_logit_titanic_sub.csv", row.names = FALSE)