fixed/renamed/reordered/added some attributes
* fixed German 'age' from being dropped
* renamed two_year_recid labels to 'Survived' and 'Recidivated' to match ProPublica article
* reordered COMPAS categories to 'Male' < 'Female'
* added 'foreign_worker' protected attribute for German
hoffmansc committed Feb 19, 2020
1 parent af3c824 commit e82b56f
Showing 3 changed files with 25 additions and 16 deletions.
19 changes: 13 additions & 6 deletions aif360/sklearn/datasets/compas_dataset.py
@@ -20,8 +20,12 @@ def fetch_compas(data_home=None, binary_race=False,
Optionally binarizes 'race' to 'Caucasian' (privileged) or
'African-American' (unprivileged). The other protected attribute is 'sex'
('Male' is *unprivileged* and 'Female' is *privileged*). The outcome
-variable is 'no recid.' (favorable) if the person was not accused of a crime
-within two years or 'did recid.' (unfavorable) if they were.
+variable is 'Survived' (favorable) if the person was not accused of a crime
+within two years or 'Recidivated' (unfavorable) if they were.
+
+Note:
+    The values for the 'sex' variable if numeric_only is ``True`` are 1 for
+    'Female' and 0 for 'Male' -- opposite the convention of other datasets.
Args:
data_home (string, optional): Specify another download and cache folder
@@ -59,16 +63,19 @@ def fetch_compas(data_home=None, binary_race=False,
for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']:
df[col] = df[col].astype('category')

-# 'did recid' < 'no recid'
-df.two_year_recid = df.two_year_recid.replace({0: 'no recid.',
-                                               1: 'did recid.'}).astype('category').cat.as_ordered()
+# 'Survived' < 'Recidivated'
+cats = ['Survived', 'Recidivated']
+df.two_year_recid = df.two_year_recid.replace([0, 1], cats).astype('category')
+df.two_year_recid = df.two_year_recid.cat.set_categories(cats, ordered=True)

if binary_race:
# 'African-American' < 'Caucasian'
df.race = df.race.cat.set_categories(['African-American', 'Caucasian'],
ordered=True)

-df.sex = df.sex.astype('category').cat.as_ordered() # 'Female' < 'Male'
+# 'Male' < 'Female'
+df.sex = df.sex.astype('category').cat.reorder_categories(
+    ['Male', 'Female'], ordered=True)

return standardize_dataset(df, prot_attr=['sex', 'race'],
target='two_year_recid', usecols=usecols,
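For context (not part of the commit): a minimal pandas-only sketch, on toy data, of what the reordered categories above imply for the integer codes a numeric_only caller would see. The docstring note about 'Female' mapping to 1 follows directly from 'Male' < 'Female'.

    import pandas as pd

    # Toy data standing in for the COMPAS columns handled above.
    sex = pd.Series(['Male', 'Female', 'Male']).astype('category')
    sex = sex.cat.reorder_categories(['Male', 'Female'], ordered=True)
    print(sex.cat.codes.tolist())    # [0, 1, 0] -- 'Female' encodes as 1

    cats = ['Survived', 'Recidivated']
    recid = pd.Series([0, 1, 1]).replace([0, 1], cats).astype('category')
    recid = recid.cat.set_categories(cats, ordered=True)
    print(recid.cat.codes.tolist())  # [0, 1, 1] -- 'Recidivated' encodes as 1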
14 changes: 8 additions & 6 deletions aif360/sklearn/datasets/openml_datasets.py
@@ -36,8 +36,8 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[],
dropcols=[], numeric_only=False, dropna=True):
"""Load the Adult Census Income Dataset.
-Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged).
-The other protected attribute is 'sex' ('Male' is privileged and 'Female' is
+Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). The
+other protected attribute is 'sex' ('Male' is privileged and 'Female' is
unprivileged). The outcome variable is 'annual-income': '>50K' (favorable)
or '<=50K' (unfavorable).
@@ -151,7 +151,8 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good'

# binarize protected attribute (but not corresponding feature)
-age = (pd.cut(df.age, [0, 25, 100], labels=numeric_only and ['young', 'aged'])
+age = (pd.cut(df.age, [0, 25, 100],
+              labels=False if numeric_only else ['young', 'aged'])
if binary_age else 'age')

# Note: marital_status directly implies sex. i.e. 'div/dep/mar' => 'female'
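Aside (my reading, not from the commit): the replaced expression `labels=numeric_only and ['young', 'aged']` evaluates to the string labels exactly when numeric_only is True and to False otherwise, i.e. the inverse of what the fix above produces. A standalone pandas sketch of the two labelling modes, using made-up ages:

    import pandas as pd

    ages = pd.Series([19, 24, 30, 67], name='age')
    print(pd.cut(ages, [0, 25, 100], labels=['young', 'aged']).tolist())
    # ['young', 'young', 'aged', 'aged'] -- string-labelled, hence non-numeric
    print(pd.cut(ages, [0, 25, 100], labels=False).tolist())
    # [0, 0, 1, 1] -- integer bin codes, which a numeric_only filter can keep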
@@ -161,9 +162,10 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
df = df.join(personal_status.astype('category'))
df.sex = df.sex.cat.as_ordered() # 'female' < 'male'

-return standardize_dataset(df, prot_attr=['sex', age], target='credit-risk',
-                           usecols=usecols, dropcols=dropcols,
-                           numeric_only=numeric_only, dropna=dropna)
+return standardize_dataset(df, prot_attr=['sex', age, 'foreign_worker'],
+                           target='credit-risk', usecols=usecols,
+                           dropcols=dropcols, numeric_only=numeric_only,
+                           dropna=dropna)

def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
numeric_only=False, dropna=False):
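A hedged usage sketch (assuming the public fetch_german entry point in aif360.sklearn.datasets and the usual X/y return from standardize_dataset): after this change the returned index should carry 'foreign_worker' as a third protected attribute alongside 'sex' and the binarized age.

    from aif360.sklearn.datasets import fetch_german

    # Downloads/caches the German credit data from OpenML on first call.
    X, y = fetch_german()
    print(X.index.names)  # expected to include 'sex', 'age' and 'foreign_worker'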
8 changes: 4 additions & 4 deletions aif360/sklearn/datasets/utils.py
@@ -28,13 +28,13 @@ def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only',
"""
if not is_list_like(labels):
labels = [labels]
-labels = [c for c in labels if isinstance(c, str)]
-already_dropped = dropped_cols.intersection(labels)
-if warn and already_dropped.any():
+str_labels = [c for c in labels if isinstance(c, str)]
+already_dropped = dropped_cols.intersection(str_labels)
+if warn and any(already_dropped):
warnings.warn("Some column labels from `{}` were already dropped by "
"`{}`:\n{}".format(name, dropped_by, already_dropped.tolist()),
ColumnAlreadyDroppedWarning, stacklevel=2)
-return [c for c in labels if c not in already_dropped]
+return [c for c in labels if not isinstance(c, str) or c not in already_dropped]

def standardize_dataset(df, prot_attr, target, sample_weight=None, usecols=[],
dropcols=[], numeric_only=False, dropna=True):
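To illustrate the utils.py change, a self-contained imitation with a hypothetical name (not the aif360 helper itself): only string labels are compared against the already-dropped column names, so array-like protected attributes, such as the binarized German age Series, pass through instead of being discarded, consistent with the "fixed German 'age' from being dropped" note in the commit message.

    import pandas as pd

    def keep_not_dropped(labels, dropped_cols):
        # Mirror of the patched logic: compare only string labels against the
        # dropped column names; anything else (e.g. a Series) is kept as-is.
        str_labels = [c for c in labels if isinstance(c, str)]
        already_dropped = dropped_cols.intersection(str_labels)
        return [c for c in labels
                if not isinstance(c, str) or c not in already_dropped]

    age = pd.Series(['young', 'aged', 'young'], name='age')  # array-like label
    kept = keep_not_dropped(['sex', age], dropped_cols=pd.Index(['sex']))
    print([type(c).__name__ for c in kept])  # ['Series'] -- the Series survives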
