-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRegression.py
329 lines (223 loc) · 10.8 KB
/
Regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 26 14:13:40 2019
@author: Srinivas Gannavarapu
"""
import math

import matplotlib as matlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels
import statsmodels.formula.api as sm
from scipy import stats
############### LAB: Correlation Calculation ########################
# Dataset: Air Travel Data (AirPassengers.csv)
# Load the air passengers data.
air = pd.read_csv("./Data/AirPassengers.csv")

# Inspect the data: dimensions, column names, first rows, summary statistics.
print(air.shape)
print(air.columns.values)
print(air.head(10))
print(air.describe())

# Correlation between the number of passengers and each candidate predictor.
# np.corrcoef returns the 2x2 correlation matrix, so element [0, 1] is the
# coefficient between the two series. The value is printed because a bare
# expression displays nothing when this file is run as a script.
for predictor in ("Promotion_Budget", "Inter_metro_flight_ratio", "Service_Quality_Score"):
    r = np.corrcoef(air.Passengers, air[predictor])[0, 1]
    print("Correlation between Passengers and", predictor, "=", r)

# Scatter plot between number of passengers and promotional budget.
# A new figure is opened so this plot does not share axes with later ones.
plt.figure()
plt.scatter(air.Passengers, air.Promotion_Budget)
plt.xlabel("Passengers")
plt.ylabel("Promotion_Budget")
plt.show()
############################## Regression ##############################
# Correlation between promotion budget and passengers count.
print(np.corrcoef(air.Passengers, air.Promotion_Budget))

# Scatter plot of Passengers against Promotion_Budget to look for a pattern.
plt.figure()
plt.scatter(air.Promotion_Budget, air.Passengers)
plt.xlabel("Promotion_Budget")
plt.ylabel("Passengers")
plt.show()

# Simple linear regression: Passengers ~ Promotion_Budget.
model = sm.ols(formula='Passengers ~ Promotion_Budget', data=air)
fitted1 = model.fit()
print(fitted1.summary())

# Expected passengers for a Promotion_Budget of 650,000.
# The estimate is taken from the fitted model instead of coefficients copied
# by hand from an earlier summary, so it stays correct if the data change.
new_budget = pd.DataFrame({"Promotion_Budget": [650000]})
print(fitted1.predict(new_budget).iloc[0])
# Simple regression of Passengers on Inter_metro_flight_ratio.
model2 = sm.ols(formula='Passengers ~ Inter_metro_flight_ratio', data=air)
fitted2 = model2.fit()
print(fitted2.summary())

# Simple regression of Passengers on Service_Quality_Score.
# This is a single-predictor model, not a multiple regression.
model3 = sm.ols(formula='Passengers ~ Service_Quality_Score', data=air)
fitted3 = model3.fit()
print(fitted3.summary())
# Fit the same Passengers ~ Promotion_Budget model with scikit-learn.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(air[["Promotion_Budget"]], air[["Passengers"]])

# Slope and intercept of the fitted line. They are printed because a bare
# attribute access displays nothing when the file is run as a script.
print("Coefficient:", lr.coef_)
print("Intercept:", lr.intercept_)
# Regression line to predict the passengers using Inter_metro_flight_ratio.
# Scatter plot first, on its own figure so it does not overlay earlier plots.
plt.figure()
plt.scatter(air.Inter_metro_flight_ratio, air.Passengers)
plt.xlabel("Inter_metro_flight_ratio")
plt.ylabel("Passengers")
plt.show()

# Regression model: Passengers ~ Inter_metro_flight_ratio.
model = sm.ols(formula='Passengers ~ Inter_metro_flight_ratio', data=air)
fitted2 = model.fit()
print(fitted2.summary())
#Building the same model using sci-kit learn
#from sklearn.linear_model import LinearRegression
#lr = LinearRegression()
#lr.fit(air[["Inter_metro_flight_ratio"]], air[["Passengers"]])
#Coefficients
#lr.coef_
#lr.intercept_
#############################################################################
############ Lab: R-Square ##################
# R-square value of the Passengers vs Promotion_Budget model.
print("R-squared (Passengers ~ Promotion_Budget):", fitted1.rsquared)
# R-square value of the Passengers vs Inter_metro_flight_ratio model.
print("R-squared (Passengers ~ Inter_metro_flight_ratio):", fitted2.rsquared)
################################################
############# Lab: Multiple Regression Model ####################
# Multiple regression model to predict the number of passengers.
model = sm.ols(formula='Passengers ~ Promotion_Budget+Service_Quality_Score+Inter_metro_flight_ratio', data=air)
fitted = model.fit()
print(fitted.summary())

# R-square value of the three-predictor model.
print("R-squared:", fitted.rsquared)

# Are there predictor variables that are not impacting the dependent variable?
# The per-coefficient p-values answer this.
print(fitted.pvalues)

# Refit with Inter_metro_flight_ratio dropped.
model = sm.ols(formula='Passengers ~ Promotion_Budget+Service_Quality_Score', data=air)
fitted = model.fit()
print(fitted.summary())
###############################################
## Adjusted R-Square
# NOTE(review): this is an absolute, user-specific path, unlike the relative
# "./Data/" path used for the air passengers data; confirm where the file
# should live before running on another machine.
adj_sample = pd.read_csv("/Users/ibm/Downloads/Adj_Sample.csv")

# Fit three nested models predicting Y and note R-squared and adjusted
# R-squared for each: x1..x3, then x1..x6, then x1..x8.
# `model` and `fitted` end up bound to the last (largest) model.
for formula in (
    'Y ~ x1+x2+x3',
    'Y ~ x1+x2+x3+x4+x5+x6',
    'Y ~ x1+x2+x3+x4+x5+x6+x7+x8',
):
    model = sm.ols(formula=formula, data=adj_sample)
    fitted = model.fit()
    print(fitted.summary())
    print(formula, "R-squared:", fitted.rsquared, "Adj. R-squared:", fitted.rsquared_adj)
#################################################################################
##### Multiple Regression - issues
# Import Final Exam Score data.
# NOTE(review): absolute, user-specific path; confirm the file location.
final_exam = pd.read_csv("/Users/ibm/Downloads/Final Exam Score.csv")

# Size of the data
print(final_exam.shape)
# Variable names
print(final_exam.columns)
# First few observations
print(final_exam.head(10))

# Model predicting the final score from all four semester scores.
# Sem1_Math is included here; otherwise this fit would be identical to the
# "Sem1_Math removed" model below and the comparison would show nothing.
model_exam = sm.ols(formula='Final_exam_marks ~ Sem1_Science+Sem2_Science+Sem1_Math+Sem2_Math', data=final_exam)
fitted_exam = model_exam.fit()
print(fitted_exam.summary())
print(fitted_exam.rsquared)
# How are Sem2_Math & Final score related? As Sem2_Math score increases, what happens to Final score?

# Remove the "Sem1_Math" variable and rebuild the model.
model2 = sm.ols(formula='Final_exam_marks ~ Sem1_Science+Sem2_Science+Sem2_Math', data=final_exam)
fitted2 = model2.fit()
print(fitted2.summary())

# Is there any change in R-square or adjusted R-square?
print("With Sem1_Math    R-squared:", fitted_exam.rsquared, "Adj. R-squared:", fitted_exam.rsquared_adj)
print("Without Sem1_Math R-squared:", fitted2.rsquared, "Adj. R-squared:", fitted2.rsquared_adj)
# How are Sem2_Math & Final score related now? As Sem2_Math score increases, what happens to Final score?

# Scatter plot between the two math predictors.
plt.figure()
plt.scatter(final_exam.Sem1_Math, final_exam.Sem2_Math)
plt.xlabel("Sem1_Math")
plt.ylabel("Sem2_Math")
plt.show()

# Correlation between Sem1_Math & Sem2_Math.
# np.corrcoef gives the Pearson correlation matrix; np.correlate would give
# the cross-correlation (an unnormalised sum of products) instead.
print(np.corrcoef(final_exam.Sem1_Math, final_exam.Sem2_Math))
######################## Multicollinearity detection #########################
## Testing multicollinearity
# Fit the model with all four semester scores as predictors and print both
# summary layouts so the coefficient estimates can be inspected.
model1 = sm.ols(formula='Final_exam_marks ~ Sem1_Science+Sem2_Science+Sem1_Math+Sem2_Math', data=final_exam)
fitted1 = model1.fit()
print(fitted1.summary())
print(fitted1.summary2())
#Code for VIF Calculation
#Writing a function to calculate the VIF values
def vif_cal(input_data, dependent_col):
    """Print and return the variance inflation factor (VIF) of each predictor.

    Every predictor column is regressed, by ordinary least squares with an
    intercept, on all the other predictor columns. Its VIF is
    ``1 / (1 - R^2)`` of that regression, rounded to two decimals.

    The regression is solved directly with numpy rather than through a
    formula string, so predictor columns that happen to be named ``x`` or
    ``y`` cannot shadow the variables being regressed.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Table holding the dependent column and the predictor columns.
        Predictor columns must be convertible to float. Rows with a missing
        value in any predictor are left out.
    dependent_col : str
        Name of the dependent column; it is excluded from the calculation.

    Returns
    -------
    dict
        Maps each predictor name to its VIF. The value is ``inf`` when the
        predictor is perfectly explained by the others and ``nan`` when the
        predictor has no variation (R^2 is undefined).
    """
    x_vars = input_data.drop([dependent_col], axis=1).dropna()
    names = list(x_vars.columns)
    values = x_vars.to_numpy(dtype=float)
    intercept = np.ones((values.shape[0], 1))

    vifs = {}
    for i, name in enumerate(names):
        target = values[:, i]
        ss_tot = float(np.sum((target - target.mean()) ** 2)) if target.size else 0.0
        if ss_tot == 0.0:
            # Constant (or empty) column: R^2 is 0/0, so the VIF is undefined.
            vif = float("nan")
        else:
            # Design matrix = intercept + every other predictor.
            design = np.hstack([intercept, np.delete(values, i, axis=1)])
            coef = np.linalg.lstsq(design, target, rcond=None)[0]
            ss_res = float(np.sum((target - design @ coef) ** 2))
            rsq = 1.0 - ss_res / ss_tot
            # Guard the division: a perfect fit means infinite inflation.
            vif = float("inf") if rsq >= 1.0 else round(1.0 / (1.0 - rsq), 2)
        vifs[name] = vif
        print(name, " VIF = ", vif)
    return vifs
# VIF of every predictor in the full final-exam model.
vif_cal(input_data=final_exam, dependent_col="Final_exam_marks")

# NOTE(review): statsmodels.stats.outliers_influence.variance_inflation_factor
# was observed to give different values here. It appears to regress on the
# matrix exactly as passed, without adding an intercept column itself;
# confirm against its documentation before comparing the two.

# Refit without Sem1_Math and check how the VIF values change.
model2 = sm.ols(formula='Final_exam_marks ~ Sem1_Science+Sem2_Science+Sem2_Math', data=final_exam)
fitted2 = model2.fit()
print(fitted2.summary())

# VIF after dropping Sem1_Math, then after also dropping Sem1_Science.
vif_cal(input_data=final_exam.drop(["Sem1_Math"], axis=1), dependent_col="Final_exam_marks")
vif_cal(input_data=final_exam.drop(["Sem1_Math","Sem1_Science"], axis=1), dependent_col="Final_exam_marks")
###############################################
## Multiple regression model building
print("Loading webpage product sales and describing the data")
# NOTE(review): absolute, user-specific path; confirm the file location.
Webpage_Product_Sales=pd.read_csv("/Users/ibm/Downloads/Webpage_Product_Sales.csv")
print("Shape --> ",Webpage_Product_Sales.shape)
print("Columns --> ",Webpage_Product_Sales.columns)

print("Build the model with most of the variables to predict the target i.e sales")
model_webpage = sm.ols(formula='Sales ~ Web_UI_Score+Server_Down_time_Sec+Holiday+Special_Discount+Clicks_From_Serach_Engine+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth', data=Webpage_Product_Sales)
fitted_webpage = model_webpage.fit()
# The summary is printed so the p-values used for the next step are visible.
print(fitted_webpage.summary())

print("ReBuild the model with optimal variables to predict the target i.e sales. Dropping Web_UI_Score & Clicks_From_Serach_Engine as it's P value is > 0.05 ")
model_webpage = sm.ols(formula='Sales ~ Server_Down_time_Sec+Holiday+Special_Discount+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth', data=Webpage_Product_Sales)
fitted_webpage = model_webpage.fit()
print(fitted_webpage.summary())
# VIF
print ("Initial Calculate VIF ")
vif_cal(Webpage_Product_Sales,"Sales")

# Candidate predictors:
# Web_UI_Score+Server_Down_time_Sec+Holiday+Special_Discount+Clicks_From_Serach_Engine+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth
print ("We found there are two VIF values >= 5 so they have to be dropped from the model and reclculate VIF")

# Refit with Clicks_From_Serach_Engine removed. Online_Ad_Paid_ref_links and
# Web_UI_Score are still in this model.
model2 = sm.ols(formula='Sales ~ Web_UI_Score+Server_Down_time_Sec+Holiday+Special_Discount+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth', data=Webpage_Product_Sales)
fitted2 = model2.fit()
print(fitted2.summary())

# VIF for the updated model. Only the column removed from model2 is dropped,
# so the VIF values describe exactly the predictors that model2 was fitted on.
vif_cal(Webpage_Product_Sales.drop(["Clicks_From_Serach_Engine"],axis=1),"Sales")
# No VIF is more than 5 at this point.
## Drop the less impacting variables based on p-values.
## Dropped Web_UI_Score based on p-value.
print("Final model")
# The predictor list is kept in one place so the variable count reported
# below always matches the formula that was actually fitted.
final_predictors = [
    "Server_Down_time_Sec",
    "Holiday",
    "Special_Discount",
    "Online_Ad_Paid_ref_links",
    "Social_Network_Ref_links",
    "Month",
    "Weekday",
    "DayofMonth",
]
model3 = sm.ols(formula='Sales ~ ' + '+'.join(final_predictors), data=Webpage_Product_Sales)
fitted3 = model3.fit()
print(fitted3.summary())

# How many variables are there in the final model?
print("Number of vars in final model ",len(final_predictors))
# What is the R-squared of the final model?
# Reported as a percentage, taken from the fit instead of a typed-in number.
print("R Squared for final model is ",round(100 * fitted3.rsquared, 2))

# Open questions:
# - There are two ways of eliminating variables, (1) p-value and (2) VIF.
#   Which one should be applied first? The two orders give different results.
# - How should the model be used to predict sales?