Ran code on COMPAS/score recidivism data

* Seems to work well
* Need to fix error in MI code
omesner · Jun 17, 2020 · 6a6c6c3 · 6a6c6c3
1 parent fd99790
commit 6a6c6c3
Show file tree

Hide file tree

Showing 8 changed files with 9,751 additions and 633 deletions.
diff --git a/simulations/boxplotsExperiment.pdf b/simulations/boxplotsExperiment.pdf
diff --git a/simulations/createBoxplots.py b/simulations/createBoxplots.py
@@ -48,7 +48,7 @@
              marker = 'x', color = colors[3], label = 'Proposed')
 
     title = list(curDat['title'])[0]
-    plt.title(title) # this line was commented out for the paper
+    # plt.title(title) # this line was commented out for the paper
     plt.legend()
     plt.xlabel("Sample Size")
     plt.ylabel("Estimated I(X;Y|Z)")

diff --git a/simulations/createSimData.py b/simulations/createSimData.py
@@ -54,10 +54,7 @@ def cmi4point(point_i, x, y, z, k, distArray):
     nxz = knncmi.countNeighbors(coord_dists, rho, x_coords + z_coords)
     nyz = knncmi.countNeighbors(coord_dists, rho, y_coords + z_coords)
     nz = knncmi.countNeighbors(coord_dists, rho, z_coords)
-    if k == k_tilde:
-        xiProp = digamma(k_tilde) - digamma(nxz) - digamma(nyz) + digamma(nz)
-    else:
-        xiProp = np.log(k_tilde) - np.log(nxz) - np.log(nyz) + np.log(nz)
+    xiProp = digamma(k_tilde) - digamma(nxz) - digamma(nyz) + digamma(nz)
     del k_tilde, nxz, nyz, nz
 
     return np.array([xiFP, xiRAVK1, xiRAVK2, xiProp])
@@ -103,9 +100,10 @@ def parallelSim(seed):
                     runDat = runDat.append(out_row, ignore_index = True)
     return(runDat)
 
+
 if __name__ == '__main__':
 
-    runs = 10 #Change to 100 to replicate simulations in paper
+    runs = 100  # 100 runs replicates the simulations in the paper
     samp_sizes = range(100, 1001, 100)
     dims = [1]
     dats = [cindep, corrUnif, mixture, discDep, contDep, contIndep, discIndep,

diff --git a/simulations/expData.csv b/simulations/expData.csv
diff --git a/simulations/race_recid.pdf b/simulations/race_recid.pdf
diff --git a/simulations/realData.py b/simulations/realData.py
@@ -8,21 +8,16 @@
 import matplotlib.pyplot as plt
 from functools import partial
 import knncmi
-from createSimData import cmi4point
+from createSimData import cmi4point, cmi4
 
 def import_raw_adult_data(rows = None):
-    col_names = ['age', 'workclass', 'income', 'edcat', 'ednum', 'maritalstat',
-                 'occupation', 'relationship', 'race', 'sex', 'capgains',
-                 'caploss', 'hrspweek', 'nativecountry']
-    dat = pd.read_csv("./adult.data", names=col_names, index_col=False)
-    if rows:
-        dat = dat.iloc[0:rows]
-    dat['income'] = dat['income']/100000
-    dat['ednum'] = dat['ednum']/100
-    #dat['race'] = dat['race'].replace({' Amer-Indian-Eskimo': ' Other',
-    #                                   ' Asian-Pac-Islander': ' Other'})
-    print("data imported")
-    return dat
+    dat = pd.read_csv('../../compas-analysis/compas-scores.csv')
+    out = dat[(dat['decile_score'] != -1) & (dat['is_recid'] != -1) &
+              (abs(dat['days_b_screening_arrest']) < 90)][[
+                  'race', 'decile_score', 'is_recid']]
+    out['decile_score'] = out['decile_score']/10
+    print(f"Data: column names: {list(out.columns)}, rows: {out.shape[0]}")
+    return out
 
 def create_sub_array(dat, var_list, size, seed):
     np.random.seed(seed)
@@ -85,24 +80,25 @@ def make_plot(dat):
     plt.legend()
     plt.xlabel("Sample Size")
     plt.ylabel("Estimated I(X;Y|Z)")
-    plt.savefig('incomeRace.pdf')
+    plt.savefig('race_recid.pdf')
 
 
 if __name__ == '__main__':
 
     runs = 100
     k = 7
-    x = [2] #  'income'
-    y = [8, 9] #  'race, sex'
-    z = [4] #  'ednum'
+    x = [0] # race
+    y = [1] # compas score 
+    z = [2] # recidivism
     dat = import_raw_adult_data()
-    #p = multiprocessing.Pool(os.cpu_count())
-    #result =  p.map(run, range(runs))
-    #out = pd.concat(result)
-    #out.to_csv("income_results.csv")
-    #make_plot(out)
-    col_names = ['age', 'workclass', 'income', 'edcat', 'ednum', 'maritalstat',
-                 'occupation', 'relationship', 'race', 'sex', 'capgains',
-                 'caploss', 'hrspweek', 'nativecountry']
-    dat = pd.read_csv("./adult.data", names=col_names, index_col=False)
-    print(dat.groupby(['sex', 'race'])['income', 'ednum'].median())
+    p = multiprocessing.Pool(os.cpu_count())
+    result =  p.map(run, range(runs))
+    out = pd.concat(result)
+    out.to_csv("recid.csv")
+    make_plot(out)
+    print("By race")
+    pd.set_option('display.max_columns', 16)
+    print(dat.groupby(['race']).describe())
+    print("CMI ests on all data")
+    tots = cmi4(x,y,z,k,dat)
+    print(f"FP: {tots[0]}, RAVK1: {tots[1]}, RAVK2: {tots[2]}, Prop: {tots[3]}")
diff --git a/simulations/recid.csv b/simulations/recid.csv
diff --git a/simulations/recid_all.txt b/simulations/recid_all.txt
@@ -0,0 +1,23 @@
+Data: column names: ['race', 'decile_score', 'is_recid'], rows: 10010
+By race
+                 decile_score                                                 \
+                        count      mean       std  min  25%   50%   75%  max   
+race                                                                           
+African-American       4960.0  0.525585  0.287048  0.1  0.3  0.50  0.80  1.0   
+Asian                    50.0  0.272000  0.241627  0.1  0.1  0.15  0.30  1.0   
+Caucasian              3503.0  0.359806  0.255125  0.1  0.1  0.30  0.50  1.0   
+Hispanic                885.0  0.325989  0.252504  0.1  0.1  0.20  0.50  1.0   
+Native American          31.0  0.451613  0.280322  0.1  0.2  0.40  0.65  1.0   
+Other                   581.0  0.277453  0.226574  0.1  0.1  0.20  0.40  1.0   
+
+                 is_recid                                               
+                    count      mean       std  min  25%  50%  75%  max  
+race                                                                    
+African-American   4960.0  0.406048  0.491143  0.0  0.0  0.0  1.0  1.0  
+Asian                50.0  0.200000  0.404061  0.0  0.0  0.0  0.0  1.0  
+Caucasian          3503.0  0.287753  0.452780  0.0  0.0  0.0  1.0  1.0  
+Hispanic            885.0  0.254237  0.435678  0.0  0.0  0.0  1.0  1.0  
+Native American      31.0  0.322581  0.475191  0.0  0.0  0.0  1.0  1.0  
+Other               581.0  0.247849  0.432135  0.0  0.0  0.0  0.0  1.0  
+CMI ests on all data
+FP: 2.427668231063506, RAVK1: 0.038202501107253324, RAVK2: 0.04104235793626344, Prop: 0.04555590623316086