Skip to content

Commit

Permalink
added detailed error messages for placing downloaded datasets (Truste…
Browse files Browse the repository at this point in the history
  • Loading branch information
hoffmansc authored Sep 14, 2018
1 parent 1f4157c commit 614ff7a
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 52 deletions.
6 changes: 3 additions & 3 deletions aif360/data/raw/compas/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
## Source:
<https://github.com/propublica/compas-analysis>

This dataset is used to assess the likelihood that a criminal defendant will re-offend. The dataset used consisted of
This dataset is used to assess the likelihood that a criminal defendant will re-offend. The dataset used consisted of

## Download instructions

Download the following file and place them as-is in the current folder
Download the following file and place it as-is in the current folder

1. [compas-scores-two-years.csv](https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv)


## Relevant Papers

* J. Angwin, J. Larson, S. Mattu, L. Kirchner, [“Machine bias: There’s software used across the country to predict future criminals. And it’s biased against blacks,”](https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing) ProPublica, 23 May 2016
* J. Angwin, J. Larson, S. Mattu, L. Kirchner, [“Machine bias: There’s software used across the country to predict future criminals. And it’s biased against blacks,”](https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing) ProPublica, 23 May 2016
24 changes: 18 additions & 6 deletions aif360/datasets/adult_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,30 @@ def __init__(self, label_name='income-per-year',
"""

train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/adult/adult.data')
'..', 'data', 'raw', 'adult', 'adult.data')
test_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/adult/adult.test')
'..', 'data', 'raw', 'adult', 'adult.test')
# as given by adult.names
column_names = ['age', 'workclass', 'fnlwgt', 'education',
'education-num', 'marital-status', 'occupation', 'relationship',
'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
'native-country', 'income-per-year']
train = pd.read_csv(train_path, header=None, names=column_names,
skipinitialspace=True, na_values=na_values)
test = pd.read_csv(test_path, header=0, names=column_names,
skipinitialspace=True, na_values=na_values)
try:
train = pd.read_csv(train_path, header=None, names=column_names,
skipinitialspace=True, na_values=na_values)
test = pd.read_csv(test_path, header=0, names=column_names,
skipinitialspace=True, na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please download the following files:")
print("\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data")
print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test")
print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names")
print("\nand place them, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'adult'))))
import sys
sys.exit(1)

df = pd.concat([train, test], ignore_index=True)

Expand Down
15 changes: 13 additions & 2 deletions aif360/datasets/bank_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,19 @@ def __init__(self, label_name='y', favorable_classes=['yes'],
"""

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/bank/bank-additional-full.csv')
df = pd.read_csv(filepath, sep=';', na_values=na_values)
'..', 'data', 'raw', 'bank', 'bank-additional-full.csv')

try:
df = pd.read_csv(filepath, sep=';', na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please download the following file:")
print("\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip")
print("\nunzip it and place the files, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'bank'))))
import sys
sys.exit(1)

super(BankDataset, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
Expand Down
15 changes: 13 additions & 2 deletions aif360/datasets/compas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,19 @@ def __init__(self, label_name='two_year_recid', favorable_classes=[0],
"""

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/compas/compas-scores-two-years.csv')
df = pd.read_csv(filepath, index_col='id', na_values=na_values)
'..', 'data', 'raw', 'compas', 'compas-scores-two-years.csv')

try:
df = pd.read_csv(filepath, index_col='id', na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please download the following file:")
print("\n\thttps://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
print("\nand place it, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'compas'))))
import sys
sys.exit(1)

super(CompasDataset, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
Expand Down
17 changes: 14 additions & 3 deletions aif360/datasets/german_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, label_name='credit', favorable_classes=[1],
"""

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/german/german.data')
'..', 'data', 'raw', 'german', 'german.data')
# as given by german.doc
column_names = ['status', 'month', 'credit_history',
'purpose', 'credit_amount', 'savings', 'employment',
Expand All @@ -79,8 +79,19 @@ def __init__(self, label_name='credit', favorable_classes=[1],
'installment_plans', 'housing', 'number_of_credits',
'skill_level', 'people_liable_for', 'telephone',
'foreign_worker', 'credit']
df = pd.read_csv(filepath, sep=' ', header=None, names=column_names,
na_values=na_values)
try:
df = pd.read_csv(filepath, sep=' ', header=None, names=column_names,
na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please download the following files:")
print("\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc")
print("\nand place them, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'german'))))
import sys
sys.exit(1)

super(GermanDataset, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
Expand Down
33 changes: 22 additions & 11 deletions aif360/datasets/meps_dataset_panel19_fy2015.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def race(row):
df = df[df['PANEL'] == 19]

# RENAME COLUMNS
df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT',
'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM',
'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE',
Expand All @@ -44,7 +44,7 @@ def race(row):
df = df[df['AGE'] >= 0] # remove values -1

df = df[df['MARRY'] >= 0] # remove values -1, -7, -8, -9

df = df[df['ASTHDX'] >= 0] # remove values -1, -7, -8, -9

df = df[(df[['FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX','EDUCYR','HIDEG',
Expand All @@ -55,14 +55,14 @@ def race(row):

def utilization(row):
return row['OBTOTV15'] + row['OPTOTV15'] + row['ERTOT15'] + row['IPNGTD15'] + row['HHTOTD15']

df['TOTEXP15'] = df.apply(lambda row: utilization(row), axis=1)
lessE = df['TOTEXP15'] < 10.0
df.loc[lessE,'TOTEXP15'] = 0.0
moreE = df['TOTEXP15'] >= 10.0
df.loc[moreE,'TOTEXP15'] = 1.0

df = df.rename(columns = {'TOTEXP15' : 'UTILIZATION'})
df = df.rename(columns = {'TOTEXP15' : 'UTILIZATION'})
return df


Expand All @@ -71,30 +71,42 @@ class MEPSDataset19(StandardDataset):
See :file:`aif360/data/raw/meps/README.md`.
"""

def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0],
protected_attribute_names=['RACE'],
privileged_classes=[['White']],
instance_weights_name='PERWT15F',
categorical_features=['REGION','SEX','MARRY',
categorical_features=['REGION','SEX','MARRY',
'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
'PHQ242','EMPST','POVCAT','INSCOV'],
features_to_keep=['REGION','AGE','SEX','RACE','MARRY',
features_to_keep=['REGION','AGE','SEX','RACE','MARRY',
'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42','PCS42',
'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV','UTILIZATION','PERWT15F'],
'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV','UTILIZATION','PERWT15F'],
features_to_drop=[],
na_values=[], custom_preprocessing=default_preprocessing,
metadata=default_mappings):

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/meps/h181.csv')
df = pd.read_csv(filepath, sep=',', na_values=na_values)
'..', 'data', 'raw', 'meps', 'h181.csv')

try:
df = pd.read_csv(filepath, sep=',', na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please follow the instructions in:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps', 'README.md'))))
print("\n to download and convert the 2015 data and place the final h181.csv file, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps'))))
import sys
sys.exit(1)

super(MEPSDataset19, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
Expand All @@ -105,4 +117,3 @@ def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0],
features_to_keep=features_to_keep,
features_to_drop=features_to_drop, na_values=na_values,
custom_preprocessing=custom_preprocessing, metadata=metadata)

35 changes: 23 additions & 12 deletions aif360/datasets/meps_dataset_panel20_fy2015.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,21 @@ def race(row):
df = df.rename(columns = {'RACEV2X' : 'RACE'})

df = df[df['PANEL'] == 20]

# RENAME COLUMNS
df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT',
'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM',
'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE',
'POVCAT15' : 'POVCAT', 'INSCOV15' : 'INSCOV'})

df = df[df['REGION'] >= 0] # remove values -1
df = df[df['AGE'] >= 0] # remove values -1

df = df[df['MARRY'] >= 0] # remove values -1, -7, -8, -9

df = df[df['ASTHDX'] >= 0] # remove values -1, -7, -8, -9

df = df[(df[['FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX','EDUCYR','HIDEG',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
Expand All @@ -55,7 +55,7 @@ def race(row):

def utilization(row):
return row['OBTOTV15'] + row['OPTOTV15'] + row['ERTOT15'] + row['IPNGTD15'] + row['HHTOTD15']

df['TOTEXP15'] = df.apply(lambda row: utilization(row), axis=1)
lessE = df['TOTEXP15'] < 10.0
df.loc[lessE,'TOTEXP15'] = 0.0
Expand All @@ -71,7 +71,7 @@ class MEPSDataset20(StandardDataset):
See :file:`aif360/data/raw/meps/README.md`.
"""

def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0],
protected_attribute_names=['RACE'],
privileged_classes=[['White']],
Expand All @@ -81,21 +81,33 @@ def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0],
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42', 'ADSMOK42', 'PHQ242',
'EMPST','POVCAT','INSCOV'],
'EMPST','POVCAT','INSCOV'],
features_to_keep=['REGION','AGE','SEX','RACE','MARRY',
'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42', 'ADSMOK42',
'PCS42',
'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV','UTILIZATION', 'PERWT15F'],
'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV','UTILIZATION', 'PERWT15F'],
features_to_drop=[],
na_values=[], custom_preprocessing=default_preprocessing,
metadata=default_mappings):

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'../data/raw/meps/h181.csv')
df = pd.read_csv(filepath, sep=',', na_values=na_values)
'..', 'data', 'raw', 'meps', 'h181.csv')

try:
df = pd.read_csv(filepath, sep=',', na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please follow the instructions in:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps', 'README.md'))))
print("\n to download and convert the 2015 data and place the final h181.csv file, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps'))))
import sys
sys.exit(1)

super(MEPSDataset20, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
Expand All @@ -106,4 +118,3 @@ def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0],
features_to_keep=features_to_keep,
features_to_drop=features_to_drop, na_values=na_values,
custom_preprocessing=custom_preprocessing, metadata=metadata)

Loading

0 comments on commit 614ff7a

Please sign in to comment.