Skip to content

Commit

Permalink
logistic regression
Browse files Browse the repository at this point in the history
  • Loading branch information
ujjwalkarn committed Oct 7, 2015
1 parent 3c170b6 commit 0843e36
Show file tree
Hide file tree
Showing 3 changed files with 554 additions and 0 deletions.
152 changes: 152 additions & 0 deletions Logistic Regression with StatsModels/logistic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""
Created on Wed Sep 09 12:38:16 2015
@author: ujjwal.karn
"""

import pandas as pd #for handling datasets
import statsmodels.api as sm #for statistical modeling
import pylab as pl #for plotting
import numpy as np #for numerical computation

# Load the training and test sets.
# NOTE: these are absolute Windows paths specific to the author's machine.
train_csv = "C:\\Users\\ujjwal.karn\\Desktop\\Python\\train.csv"
test_csv = "C:\\Users\\ujjwal.karn\\Desktop\\Python\\test.csv"
dfTrain, dfTest = [pd.read_csv(csv_path) for csv_path in (train_csv, test_csv)]

# Preview the first rows of each dataset.
# The single-argument print(...) form runs unchanged on Python 2 and Python 3.
print(dfTrain.head())
#   admit  gre   gpa  prestige
#0      0  380  3.61  good
#1      1  660  3.67  good
#2      1  800  4.00  best
#3      1  640  3.19  ok
#4      0  520  2.93  ok

print(dfTest.head())
#   gre   gpa  prestige
#0  640  3.30  veryGood
#1  660  3.60  good
#2  400  3.15  veryGood
#3  680  3.98  veryGood
#4  220  2.83  good


# Summary statistics of the numeric training columns.
print(dfTrain.describe())
#            admit         gre         gpa
#count  300.000000  300.000000  300.000000
#mean     0.306667  590.866667    3.386233
#std      0.461880  117.717630    0.374880
#min      0.000000  300.000000    2.260000
#25%      0.000000  515.000000    3.130000
#50%      0.000000  600.000000    3.390000
#75%      1.000000  680.000000    3.642500
#max      1.000000  800.000000    4.000000

# Standard deviation of each numeric column. `prestige` holds strings, so it
# is excluded explicitly: recent pandas raises instead of silently skipping
# non-numeric columns.
print(dfTrain.std(numeric_only=True))
#admit      0.46188
#gre      117.71763
#gpa        0.37488

# Frequency table of admission outcome (rows) against school prestige (columns).
# The row label is 'admit', matching the sample output below.
print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']))
#prestige  best  good  ok  veryGood
#admit
#0           20    73  47        68
#1           25    19   9        39

# Explore data: mean GRE and GPA per admission outcome (0 / 1).
# Only the numeric columns are averaged, since `prestige` holds strings and
# cannot be averaged. The result is printed so it is visible when the file is
# run as a script rather than in an interactive session.
print(dfTrain.groupby('admit')[['gre', 'gpa']].mean())
#              gre       gpa
#admit
#0      573.461538  3.336587
#1      630.217391  3.498478

# Histogram showing how GPA is distributed over the training set.
dfTrain.gpa.hist()
pl.title('Histogram of GPA')
pl.xlabel('GPA')
pl.ylabel('Frequency')
pl.show()

# Bar chart counting applicants at each GRE score, with one bar per
# admission status (True = admitted, False = not admitted).
admitted = dfTrain['admit'].astype(bool)
gre_by_admission = pd.crosstab(dfTrain['gre'], admitted)
gre_by_admission.plot(kind='bar')
pl.title('GRE score by Admission Status')
pl.xlabel('GRE score')
pl.ylabel('Frequency')
pl.show()

# Dummify prestige: one 0/1 indicator column per prestige level.
# The cast to int keeps the indicators numeric. Recent pandas returns boolean
# dummies, which turn a mixed int/float/bool design matrix into an
# object-dtype array that statsmodels refuses to fit.
dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige').astype(int)
print(dummy_ranks.head())
#   prestige_best  prestige_good  prestige_ok  prestige_veryGood
#0              0              1            0                  0
#1              0              1            0                  0
#2              1              0            0                  0
#3              0              0            1                  0
#4              0              0            1                  0

# Create a clean data frame for the regression: the response, the numeric
# predictors, and every prestige dummy from 'prestige_good' onwards.
# 'prestige_best' is left out, so it acts as the baseline level.
# `.loc` replaces `.ix`, which no longer exists in current pandas; the
# label-based column slice is the same.
cols_to_keep = ['admit', 'gre', 'gpa']
data = dfTrain[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_good':])
print(data.head())
#   admit  gre   gpa  prestige_good  prestige_ok  prestige_veryGood
#0      0  380  3.61              1            0                  0
#1      1  660  3.67              1            0                  0
#2      1  800  4.00              0            0                  0
#3      1  640  3.19              0            1                  0
#4      0  520  2.93              0            1                  0

# Manually add the intercept (sm.Logit does not add a constant itself).
data['intercept'] = 1.0

print(data.head())

# Every column except the first ('admit', the response) is a predictor.
train_cols = data.columns[1:]
print(train_cols)
# Index([u'gre', u'gpa', u'prestige_good', u'prestige_ok', u'prestige_veryGood', u'intercept'], dtype='object')

# Logistic regression of `admit` on the predictor columns in `train_cols`.
logit = sm.Logit(data['admit'], data[train_cols])

# Fit the model and print the fit summary.
result = logit.fit()
print(result.summary())

# Recreate the dummy variables for the test set. They are cast to int so the
# indicators stay numeric even where pandas returns boolean dummies.
dummy_ranks_test = pd.get_dummies(dfTest['prestige'], prefix='prestige').astype(int)
print(dummy_ranks_test)

# Create the intercept column.
dfTest['intercept'] = 1.0

# Keep exactly the prestige dummies the model was trained on, in training
# order. A prestige level missing from the test set becomes an all-zero
# column instead of a KeyError when the predictors are selected below.
trained_dummy_cols = [col for col in train_cols if col.startswith('prestige_')]
aligned_dummies = dummy_ranks_test.reindex(columns=trained_dummy_cols, fill_value=0)

# Keep only what we need for making predictions.
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
dfTest = dfTest[cols_to_keep].join(aligned_dummies)

# Predicted admission probability for every test row.
dfTest['admit_pred'] = result.predict(dfTest[train_cols])

# See the probabilities.
print(dfTest.head())

# Convert probabilities to 'yes' / 'no' using a 0.5 cut-off.
dfTest['admit_yn'] = np.where(dfTest['admit_pred'] > 0.5, 'yes', 'no')
print(dfTest.head())

# Mean GRE and GPA of predicted admits vs. predicted rejects. This is computed
# once (the original repeated the same snippet twice) and printed so the table
# is visible when the file is run as a script.
cols = ['gre', 'gpa', 'admit_yn']
print(dfTest[cols].groupby('admit_yn').mean())
#                 gre       gpa
#admit_yn
#no        556.585366  3.324268
#yes       676.666667  3.750000

# Write the test set, including the prediction columns, to a comma-separated
# file on the author's Desktop.
output_csv = 'C:\\Users\\ujjwal.karn\\Desktop\\Python\\output.csv'
dfTest.to_csv(output_csv, sep=',')
101 changes: 101 additions & 0 deletions Logistic Regression with StatsModels/test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
gre,gpa,prestige
640,3.3,veryGood
660,3.6,good
400,3.15,veryGood
680,3.98,veryGood
220,2.83,good
580,3.46,ok
540,3.17,best
580,3.51,veryGood
540,3.13,veryGood
440,2.98,good
560,4,good
660,3.67,veryGood
660,3.77,good
520,3.65,ok
540,3.46,ok
300,2.84,veryGood
340,3,veryGood
780,3.63,ok
480,3.71,ok
540,3.28,best
460,3.14,good
460,3.58,veryGood
500,3.01,ok
420,2.69,veryGood
520,2.7,good
680,3.9,best
680,3.31,veryGood
560,3.48,veryGood
580,3.34,veryGood
500,2.93,ok
740,4,good
660,3.59,good
420,2.96,best
560,3.43,good
460,3.64,good
620,3.71,best
520,3.15,good
620,3.09,ok
540,3.2,best
660,3.47,good
500,3.23,ok
560,2.65,good
500,3.95,ok
580,3.06,veryGood
520,3.35,good
500,3.03,good
600,3.35,veryGood
580,3.8,veryGood
400,3.36,veryGood
620,2.85,veryGood
780,4,veryGood
620,3.43,good
580,3.12,good
700,3.52,veryGood
540,3.78,veryGood
760,2.81,best
700,3.27,veryGood
720,3.31,best
560,3.69,good
720,3.94,good
520,4,best
540,3.49,best
680,3.14,veryGood
460,3.44,veryGood
560,3.36,best
480,2.78,good
460,2.93,good
620,3.63,good
580,4,best
800,3.89,veryGood
540,3.77,veryGood
680,3.76,good
680,2.42,best
620,3.37,best
560,3.78,veryGood
560,3.49,ok
620,3.63,veryGood
800,4,veryGood
640,3.12,good
540,2.7,veryGood
700,3.65,veryGood
540,3.49,veryGood
540,3.51,veryGood
660,4,best
480,2.62,veryGood
420,3.02,best
740,3.86,veryGood
580,3.36,veryGood
640,3.17,veryGood
640,3.51,veryGood
800,3.05,veryGood
660,3.88,veryGood
600,3.38,good
620,3.75,veryGood
460,3.99,good
620,4,veryGood
560,3.04,good
460,2.63,veryGood
700,3.65,veryGood
600,3.89,good
Loading

0 comments on commit 0843e36

Please sign in to comment.