fixed/renamed/reordered/added some attributes
* fixed German 'age' from being dropped
* renamed two_year_recid labels to 'Survived' and 'Recidivated' to match ProPublica article
* reordered COMPAS categories to 'Male' < 'Female'
* added 'foreign_worker' protected attribute for German
hoffmansc committed Feb 19, 2020
1 parent af3c824 commit e82b56f
Showing 3 changed files with 25 additions and 16 deletions.
19 changes: 13 additions & 6 deletions aif360/sklearn/datasets/compas_dataset.py
@@ -20,8 +20,12 @@ def fetch_compas(data_home=None, binary_race=False,
Optionally binarizes 'race' to 'Caucasian' (privileged) or
'African-American' (unprivileged). The other protected attribute is 'sex'
('Male' is *unprivileged* and 'Female' is *privileged*). The outcome
-variable is 'no recid.' (favorable) if the person was not accused of a crime
-within two years or 'did recid.' (unfavorable) if they were.
+variable is 'Survived' (favorable) if the person was not accused of a crime
+within two years or 'Recidivated' (unfavorable) if they were.
+
+Note:
+    The values for the 'sex' variable if numeric_only is ``True`` are 1 for
+    'Female' and 0 for 'Male' -- opposite the convention of other datasets.
Args:
data_home (string, optional): Specify another download and cache folder
@@ -59,16 +63,19 @@ def fetch_compas(data_home=None, binary_race=False,
for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']:
df[col] = df[col].astype('category')

-# 'did recid' < 'no recid'
-df.two_year_recid = df.two_year_recid.replace({0: 'no recid.',
-                                               1: 'did recid.'}).astype('category').cat.as_ordered()
+# 'Survived' < 'Recidivated'
+cats = ['Survived', 'Recidivated']
+df.two_year_recid = df.two_year_recid.replace([0, 1], cats).astype('category')
+df.two_year_recid = df.two_year_recid.cat.set_categories(cats, ordered=True)

if binary_race:
# 'African-American' < 'Caucasian'
df.race = df.race.cat.set_categories(['African-American', 'Caucasian'],
ordered=True)

-df.sex = df.sex.astype('category').cat.as_ordered() # 'Female' < 'Male'
+# 'Male' < 'Female'
+df.sex = df.sex.astype('category').cat.reorder_categories(
+    ['Male', 'Female'], ordered=True)

return standardize_dataset(df, prot_attr=['sex', 'race'],
target='two_year_recid', usecols=usecols,
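For context (not part of the commit): a minimal pandas-only sketch, on toy data, of what the reordered categories above imply for the integer codes a numeric_only caller would see. The docstring note about 'Female' mapping to 1 follows directly from 'Male' < 'Female'.

    import pandas as pd

    # Toy data standing in for the COMPAS columns handled above.
    sex = pd.Series(['Male', 'Female', 'Male']).astype('category')
    sex = sex.cat.reorder_categories(['Male', 'Female'], ordered=True)
    print(sex.cat.codes.tolist())    # [0, 1, 0] -- 'Female' encodes as 1

    cats = ['Survived', 'Recidivated']
    recid = pd.Series([0, 1, 1]).replace([0, 1], cats).astype('category')
    recid = recid.cat.set_categories(cats, ordered=True)
    print(recid.cat.codes.tolist())  # [0, 1, 1] -- 'Recidivated' encodes as 1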
14 changes: 8 additions & 6 deletions aif360/sklearn/datasets/openml_datasets.py
@@ -36,8 +36,8 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[],
dropcols=[], numeric_only=False, dropna=True):
"""Load the Adult Census Income Dataset.
-Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged).
-The other protected attribute is 'sex' ('Male' is privileged and 'Female' is
+Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). The
+other protected attribute is 'sex' ('Male' is privileged and 'Female' is
unprivileged). The outcome variable is 'annual-income': '>50K' (favorable)
or '<=50K' (unfavorable).
@@ -151,7 +151,8 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good'

# binarize protected attribute (but not corresponding feature)
-age = (pd.cut(df.age, [0, 25, 100], labels=numeric_only and ['young', 'aged'])
+age = (pd.cut(df.age, [0, 25, 100],
+              labels=False if numeric_only else ['young', 'aged'])
if binary_age else 'age')

# Note: marital_status directly implies sex. i.e. 'div/dep/mar' => 'female'
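Aside (my reading, not from the commit): the replaced expression `labels=numeric_only and ['young', 'aged']` evaluates to the string labels exactly when numeric_only is True and to False otherwise, i.e. the inverse of what the fix above produces. A standalone pandas sketch of the two labelling modes, using made-up ages:

    import pandas as pd

    ages = pd.Series([19, 24, 30, 67], name='age')
    print(pd.cut(ages, [0, 25, 100], labels=['young', 'aged']).tolist())
    # ['young', 'young', 'aged', 'aged'] -- string-labelled, hence non-numeric
    print(pd.cut(ages, [0, 25, 100], labels=False).tolist())
    # [0, 0, 1, 1] -- integer bin codes, which a numeric_only filter can keep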
@@ -161,9 +162,10 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
df = df.join(personal_status.astype('category'))
df.sex = df.sex.cat.as_ordered() # 'female' < 'male'

-return standardize_dataset(df, prot_attr=['sex', age], target='credit-risk',
-                           usecols=usecols, dropcols=dropcols,
-                           numeric_only=numeric_only, dropna=dropna)
+return standardize_dataset(df, prot_attr=['sex', age, 'foreign_worker'],
+                           target='credit-risk', usecols=usecols,
+                           dropcols=dropcols, numeric_only=numeric_only,
+                           dropna=dropna)

def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
numeric_only=False, dropna=False):
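A hedged usage sketch (assuming the public fetch_german entry point in aif360.sklearn.datasets and the usual X/y return from standardize_dataset): after this change the returned index should carry 'foreign_worker' as a third protected attribute alongside 'sex' and the binarized age.

    from aif360.sklearn.datasets import fetch_german

    # Downloads/caches the German credit data from OpenML on first call.
    X, y = fetch_german()
    print(X.index.names)  # expected to include 'sex', 'age' and 'foreign_worker'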
8 changes: 4 additions & 4 deletions aif360/sklearn/datasets/utils.py
@@ -28,13 +28,13 @@ def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only',
"""
if not is_list_like(labels):
labels = [labels]
-labels = [c for c in labels if isinstance(c, str)]
-already_dropped = dropped_cols.intersection(labels)
-if warn and already_dropped.any():
+str_labels = [c for c in labels if isinstance(c, str)]
+already_dropped = dropped_cols.intersection(str_labels)
+if warn and any(already_dropped):
warnings.warn("Some column labels from `{}` were already dropped by "
"`{}`:\n{}".format(name, dropped_by, already_dropped.tolist()),
ColumnAlreadyDroppedWarning, stacklevel=2)
-return [c for c in labels if c not in already_dropped]
+return [c for c in labels if not isinstance(c, str) or c not in already_dropped]

def standardize_dataset(df, prot_attr, target, sample_weight=None, usecols=[],
dropcols=[], numeric_only=False, dropna=True):
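To illustrate the utils.py change, a self-contained imitation with a hypothetical name (not the aif360 helper itself): only string labels are compared against the already-dropped column names, so array-like protected attributes, such as the binarized German age Series, pass through instead of being discarded, consistent with the "fixed German 'age' from being dropped" note in the commit message.

    import pandas as pd

    def keep_not_dropped(labels, dropped_cols):
        # Mirror of the patched logic: compare only string labels against the
        # dropped column names; anything else (e.g. a Series) is kept as-is.
        str_labels = [c for c in labels if isinstance(c, str)]
        already_dropped = dropped_cols.intersection(str_labels)
        return [c for c in labels
                if not isinstance(c, str) or c not in already_dropped]

    age = pd.Series(['young', 'aged', 'young'], name='age')  # array-like label
    kept = keep_not_dropped(['sex', age], dropped_cols=pd.Index(['sex']))
    print([type(c).__name__ for c in kept])  # ['Series'] -- the Series survives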
