postprocessor takes DataFrame if use_proba

added additional tests to check this
firmai · Feb 19, 2020 · 4a84e49 · 4a84e49
1 parent e1cf33e
commit 4a84e49
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 63 deletions.
diff --git a/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py b/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py
@@ -171,16 +171,16 @@ def predict(self, dataset, threshold=0.5):
             dataset.protected_attribute_names,
             self.unprivileged_groups)
 
-        priv_indices = (np.random.random(sum(cond_vec_priv))
-                     <= self.priv_mix_rate)
-        priv_new_pred = dataset.scores[cond_vec_priv].copy()
-        priv_new_pred[priv_indices] = self.base_rate_priv
-
         unpriv_indices = (np.random.random(sum(cond_vec_unpriv))
                        <= self.unpriv_mix_rate)
         unpriv_new_pred = dataset.scores[cond_vec_unpriv].copy()
         unpriv_new_pred[unpriv_indices] = self.base_rate_unpriv
 
+        priv_indices = (np.random.random(sum(cond_vec_priv))
+                     <= self.priv_mix_rate)
+        priv_new_pred = dataset.scores[cond_vec_priv].copy()
+        priv_new_pred[priv_indices] = self.base_rate_priv
+
         dataset_new = dataset.copy(deepcopy=True)
 
         dataset_new.scores = np.zeros_like(dataset.scores, dtype=np.float64)

diff --git a/aif360/sklearn/postprocessing/__init__.py b/aif360/sklearn/postprocessing/__init__.py
@@ -26,8 +26,8 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin):
         be used as the input to this meta-estimator not the other way around.
 
     Attributes:
-        estimator_: Cloned ``estimator``.
-        postprocessor_: Cloned ``postprocessor``.
+        estimator_: Fitted estimator.
+        postprocessor_: Fitted postprocessor.
         use_proba_ (bool): Determined depending on the postprocessor type if
             `use_proba` is None.
     """
@@ -49,7 +49,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(),
             **options: Keyword options passed through to
                 :func:`~sklearn.model_selection.train_test_split`.
                 Note: 'train_size' and 'test_size' will be ignored in favor of
-                ``val_size``.
+                'val_size'.
         """
         self.estimator = estimator
         self.postprocessor = postprocessor
@@ -70,14 +70,14 @@ def fit(self, X, y, sample_weight=None, **fit_params):
 
         Args:
             X (array-like): Training samples.
-            y (array-like): Training labels.
+            y (pandas.Series): Training labels.
             sample_weight (array-like, optional): Sample weights.
-            **fit_params: Parameters passed to the post-processor ``fit``
+            **fit_params: Parameters passed to the post-processor ``fit()``
                 method. Note: these do not need to be prefixed with ``__``
                 notation.
 
         Returns:
-            PostProcessingMeta: self.
+            self
         """
         self.use_proba_ = (self.use_proba if self.use_proba is not None else
                 isinstance(self.postprocessor, CalibratedEqualizedOdds))
@@ -115,34 +115,34 @@ def fit(self, X, y, sample_weight=None, **fit_params):
     def predict(self, X):
         """Predict class labels for the given samples.
 
-        First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
+        First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
         ``self.use_proba_`` is ``True``) then returns the post-processed output
         from those predictions.
 
         Args:
-            X (array-like): Test samples.
+            X (pandas.DataFrame): Test samples.
 
         Returns:
             numpy.ndarray: Predicted class label per sample.
         """
         y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
                   self.estimator_.predict_proba(X))
-        y_pred = pd.Series(y_pred, index=X.index)
+        y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
         return self.postprocessor_.predict(y_pred)
 
     @if_delegate_has_method('postprocessor_')
     def predict_proba(self, X):
         """Probability estimates.
 
-        First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
+        First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
         ``self.use_proba_`` is ``True``) then returns the post-processed output
         from those predictions.
 
         The returned estimates for all classes are ordered by the label of
         classes.
 
         Args:
-            X (array-like): Test samples.
+            X (pandas.DataFrame): Test samples.
 
         Returns:
             numpy.ndarray: Returns the probability of the sample for each class
@@ -151,22 +151,22 @@ def predict_proba(self, X):
         """
         y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
                   self.estimator_.predict_proba(X))
-        y_pred = pd.Series(y_pred, index=X.index)
+        y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
         return self.postprocessor_.predict_proba(y_pred)
 
     @if_delegate_has_method('postprocessor_')
     def predict_log_proba(self, X):
         """Log of probability estimates.
 
-        First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
+        First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
         ``self.use_proba_`` is ``True``) then returns the post-processed output
         from those predictions.
 
         The returned estimates for all classes are ordered by the label of
         classes.
 
         Args:
-            X (array-like): Test samples.
+            X (pandas.DataFrame): Test samples.
 
         Returns:
             array: Returns the log-probability of the sample for each class in
@@ -175,29 +175,29 @@ def predict_log_proba(self, X):
         """
         y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
                   self.estimator_.predict_proba(X))
-        y_pred = pd.Series(y_pred, index=X.index)
+        y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
         return self.postprocessor_.predict_log_proba(y_pred)
 
     @if_delegate_has_method('postprocessor_')
     def score(self, X, y, sample_weight=None):
         """Returns the output of the post-processor's score function on the
         given test data and labels.
 
-        First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
+        First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
         ``self.use_proba_`` is ``True``) then gets the post-processed output
         from those predictions and scores it.
 
         Args:
-            X (array-like): Test samples.
-            y (array-like): True labels for ``X``.
+            X (pandas.DataFrame): Test samples.
+            y (array-like): True labels for X.
             sample_weight (array-like, optional): Sample weights.
 
         Returns:
             float: Score value.
         """
         y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
                   self.estimator_.predict_proba(X))
-        y_pred = pd.Series(y_pred, index=X.index)
+        y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
         return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight)
 
 

diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py
@@ -5,7 +5,7 @@
 
 from aif360.sklearn.metrics import difference, base_rate
 from aif360.sklearn.metrics import generalized_fnr, generalized_fpr
-from aif360.sklearn.utils import check_groups
+from aif360.sklearn.utils import check_inputs, check_groups
 
 
 class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin):
@@ -16,9 +16,9 @@ class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin):
     change output labels with an equalized odds objective [#pleiss17]_.
 
     Note:
-        This breaks the sckit-learn API by requiring fit params ``y_true``,
-        ``y_pred``, and ``pos_label`` and predict param ``y_pred``. See
-        :class:`PostProcessingMeta` for a workaround.
+        This breaks the scikit-learn API by requiring fit params y_true, y_pred,
+        and pos_label and predict param y_pred. See :class:`PostProcessingMeta`
+        for a workaround.
 
     References:
         .. [#pleiss17] `G. Pleiss, M. Raghavan, F. Wu, J. Kleinberg, and
@@ -85,17 +85,20 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None):
         Args:
             y_pred (array-like): Probability estimates of the targets as
                 returned by a ``predict_proba()`` call or equivalent.
-            y_true (array-like): Ground-truth (correct) target values.
+            y_true (pandas.Series): Ground-truth (correct) target values.
             labels (list, optional): The ordered set of labels values. Must
-                match the order of columns in ``y_pred`` if provided. By
-                default, all labels in ``y_true`` are used in sorted order.
+                match the order of columns in y_pred if provided. By default,
+                all labels in y_true are used in sorted order.
             pos_label (scalar, optional): The label of the positive class.
             sample_weight (array-like, optional): Sample weights.
 
         Returns:
-            CalibratedEqualizedOdds: self.
+            self
         """
-        groups, self.prot_attr_ = check_groups(y_true, self.prot_attr)
+        y_pred, y_true, sample_weight = check_inputs(y_pred, y_true,
+                                                     sample_weight)
+        groups, self.prot_attr_ = check_groups(y_true, self.prot_attr,
+                                               ensure_binary=True)
         self.classes_ = labels if labels is not None else np.unique(y_true)
         self.groups_ = np.unique(groups)
         self.pos_label_ = pos_label
@@ -107,20 +110,14 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None):
             raise ValueError('pos_label={} is not in the set of labels. The '
                     'valid values are:\n{}'.format(pos_label, self.classes_))
 
-        if len(self.groups_) != 2:
-            raise ValueError('prot_attr={}\nyielded {} groups:\n{}\nbut this '
-                    'algorithm requires a binary division of the data.'.format(
-                            self.prot_attr_, len(self.groups_), self.groups_))
-
         y_pred = y_pred[:, np.nonzero(self.classes_ == self.pos_label_)[0][0]]
 
         # local function to return corresponding args for metric evaluation
         def _args(grp_idx, triv=False):
             idx = (groups == self.groups_[grp_idx])
             pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else
                     y_pred)
-            return [y_true[idx], pred[idx], pos_label,
-                    sample_weight[idx] if sample_weight is not None else None]
+            return [y_true[idx], pred[idx], pos_label, sample_weight[idx]]
 
         self.base_rates_ = [base_rate(*_args(i)) for i in range(2)]
 
@@ -138,8 +135,9 @@ def predict_proba(self, y_pred):
         classes.
 
         Args:
-            y_pred (array-like): Probability estimates of the targets as
-                returned by a ``predict_proba()`` call or equivalent.
+            y_pred (pandas.DataFrame): Probability estimates of the targets as
+                returned by a ``predict_proba()`` call or equivalent. Note: must
+                include protected attributes in the index.
 
         Returns:
             numpy.ndarray: Returns the probability of the sample for each class
@@ -156,7 +154,7 @@ def predict_proba(self, y_pred):
                                      np.unique(groups), self.groups_))
 
         pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0]
-        y_pred = y_pred[:, pos_idx]
+        y_pred = y_pred.iloc[:, pos_idx]
 
         yt = np.empty_like(y_pred)
         for grp_idx in range(2):
@@ -172,8 +170,9 @@ def predict(self, y_pred):
         """Predict class labels for the given scores.
 
         Args:
-            y_pred (array-like): Probability estimates of the targets as
-                returned by a ``predict_proba()`` call or equivalent.
+            y_pred (pandas.DataFrame): Probability estimates of the targets as
+                returned by a ``predict_proba()`` call or equivalent. Note: must
+                include protected attributes in the index.
 
         Returns:
             numpy.ndarray: Predicted class label per sample.
@@ -185,8 +184,9 @@ def score(self, y_pred, y_true, sample_weight=None):
         """Score the predictions according to the cost constraint specified.
 
         Args:
-            y_pred (array-like): Probability estimates of the targets as
-                returned by a ``predict_proba()`` call or equivalent.
+            y_pred (pandas.DataFrame): Probability estimates of the targets as
+                returned by a ``predict_proba()`` call or equivalent. Note: must
+                include protected attributes in the index.
             y_true (array-like): Ground-truth (correct) target values.
             sample_weight (array-like, optional): Sample weights.
 

diff --git a/tests/sklearn/test_calibrated_equalized_odds.py b/tests/sklearn/test_calibrated_equalized_odds.py
@@ -13,7 +13,7 @@
         features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss',
                           'hours-per-week'], features_to_drop=[])
 
-def test_calib_eq_odds_sex():
+def test_calib_eq_odds_sex_weighted():
     logreg = LogisticRegression(solver='lbfgs', max_iter=500)
     y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X)
     adult_pred = adult.copy()
@@ -27,31 +27,45 @@ def test_calib_eq_odds_sex():
     assert np.isclose(orig_cal_eq_odds.priv_mix_rate, cal_eq_odds.mix_rates_[1])
     assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0])
 
-def test_split():
-    adult_est, adult_post = adult.split([0.75], shuffle=False)
-    X_est, X_post, y_est, y_post = train_test_split(X, y, shuffle=False)
+def test_postprocessingmeta_fnr():
+    adult_train, adult_test = adult.split([0.9], shuffle=False)
+    X_tr, X_te, y_tr, _, sw_tr, _ = train_test_split(X, y, sample_weight,
+                train_size=0.9, shuffle=False)
 
-    assert np.all(adult_est.features == X_est)
-    assert np.all(adult_est.labels.ravel() == y_est)
-    assert np.all(adult_post.features == X_post)
-    assert np.all(adult_post.labels.ravel() == y_post)
+    assert np.all(adult_train.features == X_tr)
+    assert np.all(adult_test.features == X_te)
+    assert np.all(adult_train.labels.ravel() == y_tr)
+
+    adult_est, adult_post = adult_train.split([0.75], shuffle=False)
 
-def test_postprocessingmeta():
     logreg = LogisticRegression(solver='lbfgs', max_iter=500)
+    logreg.fit(adult_est.features, adult_est.labels.ravel(),
+               sample_weight=adult_est.instance_weights)
+    probas_pred = logreg.predict_proba(adult_post.features)[:, 1]
 
-    adult_est, adult_post = adult.split([0.75], shuffle=False)
-    logreg.fit(adult_est.features, adult_est.labels.ravel())
-    y_pred = logreg.predict_proba(adult_post.features)[:, 1]
     adult_pred = adult_post.copy()
-    adult_pred.scores = y_pred
+    adult_pred.scores = probas_pred
+
     orig_cal_eq_odds = CalibratedEqOddsPostprocessing(
-            unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}])
+            unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}],
+            cost_constraint='fnr', seed=0)
     orig_cal_eq_odds.fit(adult_post, adult_pred)
 
     cal_eq_odds = PostProcessingMeta(estimator=logreg,
-            postprocessor=CalibratedEqualizedOdds('sex'), shuffle=False)
-    cal_eq_odds.fit(X, y, sample_weight=sample_weight)
+            postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', random_state=0),
+            shuffle=False)
+    cal_eq_odds.fit(X_tr, y_tr, sample_weight=sw_tr)
+
+    assert np.allclose(logreg.coef_, cal_eq_odds.estimator_.coef_)
 
     assert np.allclose([orig_cal_eq_odds.unpriv_mix_rate,
                         orig_cal_eq_odds.priv_mix_rate],
                        cal_eq_odds.postprocessor_.mix_rates_)
+
+    adult_test_pred = adult_test.copy()
+    adult_test_pred.scores = logreg.predict_proba(adult_test.features)[:, 1]
+    adult_test_pred = orig_cal_eq_odds.predict(adult_test_pred)
+
+    y_test_pred = cal_eq_odds.predict_proba(X_te)
+
+    assert np.allclose(adult_test_pred.scores, y_test_pred[:, 1])