forked from ujjwalkarn/DataSciencePython
Commit 0843e36 (1 parent: 3c170b6)
Showing 3 changed files with 554 additions and 0 deletions.
@@ -0,0 +1,152 @@
""" | ||
Created on Wed Sep 09 12:38:16 2015 | ||
@author: ujjwal.karn | ||
""" | ||
|
||
import pandas as pd #for handling datasets | ||
import statsmodels.api as sm #for statistical modeling | ||
import pylab as pl #for plotting | ||
import numpy as np #for numerical computation | ||
|
||
# read the data in | ||
dfTrain = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\train.csv") | ||
dfTest = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\test.csv") | ||
|
||
# take a look at the dataset | ||
print dfTrain.head() | ||
# admit gre gpa prestige | ||
#0 0 380 3.61 good | ||
#1 1 660 3.67 good | ||
#2 1 800 4.00 best | ||
#3 1 640 3.19 ok | ||
#4 0 520 2.93 ok | ||
|
||
print dfTest.head() | ||
# gre gpa prestige | ||
#0 640 3.30 veryGood | ||
#1 660 3.60 good | ||
#2 400 3.15 veryGood | ||
#3 680 3.98 veryGood | ||
#4 220 2.83 good | ||
|
||
|
||
# summarize the data | ||
print dfTrain.describe() | ||
# admit gre gpa | ||
#count 300.000000 300.000000 300.000000 | ||
#mean 0.306667 590.866667 3.386233 | ||
#std 0.461880 117.717630 0.374880 | ||
#min 0.000000 300.000000 2.260000 | ||
#25% 0.000000 515.000000 3.130000 | ||
#50% 0.000000 600.000000 3.390000 | ||
#75% 1.000000 680.000000 3.642500 | ||
#max 1.000000 800.000000 4.000000 | ||
|
||
# take a look at the standard deviation of each column | ||
print dfTrain.std() | ||
#admit 0.46188 | ||
#gre 117.71763 | ||
#gpa 0.37488 | ||
|

# frequency table of prestige versus whether or not someone was admitted
print pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit'])
#prestige  best  good  ok  veryGood
#admit
#0           20    73  47        68
#1           25    19   9        39
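
# A quick follow-up sketch (pandas only, nothing new imported): with prestige on
# the rows, normalizing each row of the same crosstab turns the raw counts into
# the share of applicants admitted vs. rejected within each prestige level.
crosstab = pd.crosstab(dfTrain['prestige'], dfTrain['admit'])
print crosstab.apply(lambda r: r / float(r.sum()), axis=1)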

#explore data
dfTrain.groupby('admit').mean()
#              gre       gpa
#admit
#0      573.461538  3.336587
#1      630.217391  3.498478

# plot one column
dfTrain['gpa'].hist()
pl.title('Histogram of GPA')
pl.xlabel('GPA')
pl.ylabel('Frequency')
pl.show()

# barplot of gre score grouped by admission status (True or False)
pd.crosstab(dfTrain.gre, dfTrain.admit.astype(bool)).plot(kind='bar')
pl.title('GRE score by Admission Status')
pl.xlabel('GRE score')
pl.ylabel('Frequency')
pl.show()
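
# Readability sketch: the crosstab above draws one bar per distinct GRE score.
# Bucketing the scores with pd.cut first gives a more compact chart (the bin
# edges below are an arbitrary choice, not part of the original analysis).
gre_bins = pd.cut(dfTrain['gre'], bins=[200, 400, 500, 600, 700, 800])
pd.crosstab(gre_bins, dfTrain.admit.astype(bool)).plot(kind='bar')
pl.title('GRE score (binned) by Admission Status')
pl.xlabel('GRE score range')
pl.ylabel('Frequency')
pl.show()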

# dummify prestige
dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige')
print dummy_ranks.head()
#   prestige_best  prestige_good  prestige_ok  prestige_veryGood
#0              0              1            0                  0
#1              0              1            0                  0
#2              1              0            0                  0
#3              0              0            1                  0
#4              0              0            1                  0

# create a clean data frame for the regression
cols_to_keep = ['admit', 'gre', 'gpa']
data = dfTrain[cols_to_keep].join(dummy_ranks.ix[:, 'prestige_good':])
print data.head()
#   admit  gre   gpa  prestige_good  prestige_ok  prestige_veryGood
#0      0  380  3.61              1            0                  0
#1      1  660  3.67              1            0                  0
#2      1  800  4.00              0            0                  0
#3      1  640  3.19              0            1                  0
#4      0  520  2.93              0            1                  0
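
# Note on the slice above: taking columns from 'prestige_good' onward drops
# 'prestige_best', which becomes the reference level and avoids the
# dummy-variable trap once the intercept is added below. A one-line sketch of
# the same idea, assuming a pandas version that supports drop_first:
# dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige', drop_first=True)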

# manually add the intercept
data['intercept'] = 1.0
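# An equivalent statsmodels helper exists (sketch only; it names the column
# 'const' rather than 'intercept'):
# data = sm.add_constant(data, prepend=False)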

print data.head()

train_cols = data.columns[1:]
print data.columns[1:]
# Index([u'gre', u'gpa', u'prestige_good', u'prestige_ok', u'prestige_veryGood', u'intercept'], dtype='object')

#Logistic Regression
logit = sm.Logit(data['admit'], data[train_cols])
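# For reference, a sketch of the same model via the formula interface, which
# builds the dummies and the intercept automatically ('best' would again be the
# reference level for prestige):
# import statsmodels.formula.api as smf
# logit = smf.logit("admit ~ gre + gpa + C(prestige)", data=dfTrain)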

# fit the model
result = logit.fit()
print result.summary()
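
# The fitted coefficients are on the log-odds scale; exponentiating them gives
# odds ratios, and conf_int() gives the confidence intervals (standard
# statsmodels calls, shown here as a quick sketch).
print np.exp(result.params)   # odds ratio for a one-unit change in each predictor
print result.conf_int()       # confidence intervals for the coefficients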

# recreate the dummy variables
dummy_ranks_test = pd.get_dummies(dfTest['prestige'], prefix='prestige')
print dummy_ranks_test
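
# Defensive sketch: if the test file were ever missing a prestige level, its dummy
# columns would not line up with the training columns; reindexing against the
# training dummies guards against that (a no-op for this particular test file).
dummy_ranks_test = dummy_ranks_test.reindex(columns=dummy_ranks.columns, fill_value=0)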

#create intercept column
dfTest['intercept'] = 1.0

# keep only what we need for making predictions
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
dfTest = dfTest[cols_to_keep].join(dummy_ranks_test.ix[:, 'prestige_good':])

dfTest.head()
# make predictions on the enumerated dataset
dfTest['admit_pred'] = result.predict(dfTest[train_cols])

#see probabilities
print dfTest.head()
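
# Sanity-check sketch: Logit.predict applies the logistic function to the linear
# predictor, so the probabilities above can be reproduced by hand.
linpred = np.dot(dfTest[train_cols], result.params)
print 1.0 / (1.0 + np.exp(-linpred))   # should match dfTest['admit_pred']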

#convert probabilities to 'yes' 'no'
dfTest['admit_yn'] = np.where(dfTest['admit_pred'] > 0.5, 'yes', 'no')
print dfTest.head()

cols = ['gre', 'gpa', 'admit_yn']
dfTest[cols].groupby('admit_yn').mean()
#                 gre       gpa
#admit_yn
#no        556.585366  3.324268
#yes       676.666667  3.750000

dfTest.to_csv('C:\\Users\\ujjwal.karn\\Desktop\\Python\\output.csv', sep=',')
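
# Porting note (a sketch, assuming Python 3 and a current pandas; not re-run
# against the original files): print becomes a function, e.g. print(dfTrain.head()),
# and .ix has been removed, so the column slices would use .loc instead, e.g.
# dummy_ranks.loc[:, 'prestige_good':].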
@@ -0,0 +1,101 @@
gre,gpa,prestige
640,3.3,veryGood
660,3.6,good
400,3.15,veryGood
680,3.98,veryGood
220,2.83,good
580,3.46,ok
540,3.17,best
580,3.51,veryGood
540,3.13,veryGood
440,2.98,good
560,4,good
660,3.67,veryGood
660,3.77,good
520,3.65,ok
540,3.46,ok
300,2.84,veryGood
340,3,veryGood
780,3.63,ok
480,3.71,ok
540,3.28,best
460,3.14,good
460,3.58,veryGood
500,3.01,ok
420,2.69,veryGood
520,2.7,good
680,3.9,best
680,3.31,veryGood
560,3.48,veryGood
580,3.34,veryGood
500,2.93,ok
740,4,good
660,3.59,good
420,2.96,best
560,3.43,good
460,3.64,good
620,3.71,best
520,3.15,good
620,3.09,ok
540,3.2,best
660,3.47,good
500,3.23,ok
560,2.65,good
500,3.95,ok
580,3.06,veryGood
520,3.35,good
500,3.03,good
600,3.35,veryGood
580,3.8,veryGood
400,3.36,veryGood
620,2.85,veryGood
780,4,veryGood
620,3.43,good
580,3.12,good
700,3.52,veryGood
540,3.78,veryGood
760,2.81,best
700,3.27,veryGood
720,3.31,best
560,3.69,good
720,3.94,good
520,4,best
540,3.49,best
680,3.14,veryGood
460,3.44,veryGood
560,3.36,best
480,2.78,good
460,2.93,good
620,3.63,good
580,4,best
800,3.89,veryGood
540,3.77,veryGood
680,3.76,good
680,2.42,best
620,3.37,best
560,3.78,veryGood
560,3.49,ok
620,3.63,veryGood
800,4,veryGood
640,3.12,good
540,2.7,veryGood
700,3.65,veryGood
540,3.49,veryGood
540,3.51,veryGood
660,4,best
480,2.62,veryGood
420,3.02,best
740,3.86,veryGood
580,3.36,veryGood
640,3.17,veryGood
640,3.51,veryGood
800,3.05,veryGood
660,3.88,veryGood
600,3.38,good
620,3.75,veryGood
460,3.99,good
620,4,veryGood
560,3.04,good
460,2.63,veryGood
700,3.65,veryGood
600,3.89,good