Skip to content

Commit

Permalink
Ran code on COMPAS/score recidivism data
Browse files (browse the repository at this point in the history)
	* seems to work well
	* need to fix error in MI code
  • Loading branch information
Octavio committed Jun 17, 2020
1 parent fd99790 commit 6a6c6c3
Show file tree
Hide file tree
Showing 8 changed files with 9,751 additions and 633 deletions.
Binary file modified simulations/boxplotsExperiment.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion simulations/createBoxplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
marker = 'x', color = colors[3], label = 'Proposed')

title = list(curDat['title'])[0]
plt.title(title) # this line was commented out for the paper
# plt.title(title) # this line was commented out for the paper
plt.legend()
plt.xlabel("Sample Size")
plt.ylabel("Estimated I(X;Y|Z)")
Expand Down
8 changes: 3 additions & 5 deletions simulations/createSimData.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,7 @@ def cmi4point(point_i, x, y, z, k, distArray):
nxz = knncmi.countNeighbors(coord_dists, rho, x_coords + z_coords)
nyz = knncmi.countNeighbors(coord_dists, rho, y_coords + z_coords)
nz = knncmi.countNeighbors(coord_dists, rho, z_coords)
if k == k_tilde:
xiProp = digamma(k_tilde) - digamma(nxz) - digamma(nyz) + digamma(nz)
else:
xiProp = np.log(k_tilde) - np.log(nxz) - np.log(nyz) + np.log(nz)
xiProp = digamma(k_tilde) - digamma(nxz) - digamma(nyz) + digamma(nz)
del k_tilde, nxz, nyz, nz

return np.array([xiFP, xiRAVK1, xiRAVK2, xiProp])
Expand Down Expand Up @@ -103,9 +100,10 @@ def parallelSim(seed):
runDat = runDat.append(out_row, ignore_index = True)
return(runDat)


if __name__ == '__main__':

runs = 10 #Change to 100 to replicate simulations in paper
runs = 100 # 100 runs replicate the simulations in the paper
samp_sizes = range(100, 1001, 100)
dims = [1]
dats = [cindep, corrUnif, mixture, discDep, contDep, contIndep, discIndep,
Expand Down
9,300 changes: 8,700 additions & 600 deletions simulations/expData.csv

Large diffs are not rendered by default.

Binary file added simulations/race_recid.pdf
Binary file not shown.
50 changes: 23 additions & 27 deletions simulations/realData.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,16 @@
import matplotlib.pyplot as plt
from functools import partial
import knncmi
from createSimData import cmi4point
from createSimData import cmi4point, cmi4

def import_raw_adult_data(rows = None):
col_names = ['age', 'workclass', 'income', 'edcat', 'ednum', 'maritalstat',
'occupation', 'relationship', 'race', 'sex', 'capgains',
'caploss', 'hrspweek', 'nativecountry']
dat = pd.read_csv("./adult.data", names=col_names, index_col=False)
if rows:
dat = dat.iloc[0:rows]
dat['income'] = dat['income']/100000
dat['ednum'] = dat['ednum']/100
#dat['race'] = dat['race'].replace({' Amer-Indian-Eskimo': ' Other',
# ' Asian-Pac-Islander': ' Other'})
print("data imported")
return dat
dat = pd.read_csv('../../compas-analysis/compas-scores.csv')
out = dat[(dat['decile_score'] != -1) & (dat['is_recid'] != -1) &
(abs(dat['days_b_screening_arrest']) < 90)][[
'race', 'decile_score', 'is_recid']]
out['decile_score'] = out['decile_score']/10
print(f"Data: column names: {list(out.columns)}, rows: {out.shape[0]}")
return out

def create_sub_array(dat, var_list, size, seed):
np.random.seed(seed)
Expand Down Expand Up @@ -85,24 +80,25 @@ def make_plot(dat):
plt.legend()
plt.xlabel("Sample Size")
plt.ylabel("Estimated I(X;Y|Z)")
plt.savefig('incomeRace.pdf')
plt.savefig('race_recid.pdf')


if __name__ == '__main__':

runs = 100
k = 7
x = [2] # 'income'
y = [8, 9] # 'race, sex'
z = [4] # 'ednum'
x = [0] # race
y = [1] # compas score
z = [2] # recidivism
dat = import_raw_adult_data()
#p = multiprocessing.Pool(os.cpu_count())
#result = p.map(run, range(runs))
#out = pd.concat(result)
#out.to_csv("income_results.csv")
#make_plot(out)
col_names = ['age', 'workclass', 'income', 'edcat', 'ednum', 'maritalstat',
'occupation', 'relationship', 'race', 'sex', 'capgains',
'caploss', 'hrspweek', 'nativecountry']
dat = pd.read_csv("./adult.data", names=col_names, index_col=False)
print(dat.groupby(['sex', 'race'])['income', 'ednum'].median())
p = multiprocessing.Pool(os.cpu_count())
result = p.map(run, range(runs))
out = pd.concat(result)
out.to_csv("recid.csv")
make_plot(out)
print("By race")
pd.set_option('display.max_columns', 16)
print(dat.groupby(['race']).describe())
print("CMI ests on all data")
tots = cmi4(x,y,z,k,dat)
print(f"FP: {tots[0]}, RAVK1: {tots[1]}, RAVK2: {tots[2]}, Prop: {tots[3]}")
1,001 changes: 1,001 additions & 0 deletions simulations/recid.csv

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions simulations/recid_all.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Data: column names: ['race', 'decile_score', 'is_recid'], rows: 10010
By race
decile_score \
count mean std min 25% 50% 75% max
race
African-American 4960.0 0.525585 0.287048 0.1 0.3 0.50 0.80 1.0
Asian 50.0 0.272000 0.241627 0.1 0.1 0.15 0.30 1.0
Caucasian 3503.0 0.359806 0.255125 0.1 0.1 0.30 0.50 1.0
Hispanic 885.0 0.325989 0.252504 0.1 0.1 0.20 0.50 1.0
Native American 31.0 0.451613 0.280322 0.1 0.2 0.40 0.65 1.0
Other 581.0 0.277453 0.226574 0.1 0.1 0.20 0.40 1.0

is_recid
count mean std min 25% 50% 75% max
race
African-American 4960.0 0.406048 0.491143 0.0 0.0 0.0 1.0 1.0
Asian 50.0 0.200000 0.404061 0.0 0.0 0.0 0.0 1.0
Caucasian 3503.0 0.287753 0.452780 0.0 0.0 0.0 1.0 1.0
Hispanic 885.0 0.254237 0.435678 0.0 0.0 0.0 1.0 1.0
Native American 31.0 0.322581 0.475191 0.0 0.0 0.0 1.0 1.0
Other 581.0 0.247849 0.432135 0.0 0.0 0.0 0.0 1.0
CMI ests on all data
FP: 2.427668231063506, RAVK1: 0.038202501107253324, RAVK2: 0.04104235793626344, Prop: 0.04555590623316086

0 comments on commit 6a6c6c3

Please sign in to comment.