Skip to content

Commit

Permalink
postprocessor takes DataFrame if use_proba
Browse files Browse the repository at this point in the history
added additional tests to check this
  • Loading branch information
hoffmansc committed Feb 19, 2020
1 parent e1cf33e commit 4a84e49
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,16 @@ def predict(self, dataset, threshold=0.5):
dataset.protected_attribute_names,
self.unprivileged_groups)

priv_indices = (np.random.random(sum(cond_vec_priv))
<= self.priv_mix_rate)
priv_new_pred = dataset.scores[cond_vec_priv].copy()
priv_new_pred[priv_indices] = self.base_rate_priv

unpriv_indices = (np.random.random(sum(cond_vec_unpriv))
<= self.unpriv_mix_rate)
unpriv_new_pred = dataset.scores[cond_vec_unpriv].copy()
unpriv_new_pred[unpriv_indices] = self.base_rate_unpriv

priv_indices = (np.random.random(sum(cond_vec_priv))
<= self.priv_mix_rate)
priv_new_pred = dataset.scores[cond_vec_priv].copy()
priv_new_pred[priv_indices] = self.base_rate_priv

dataset_new = dataset.copy(deepcopy=True)

dataset_new.scores = np.zeros_like(dataset.scores, dtype=np.float64)
Expand Down
38 changes: 19 additions & 19 deletions aif360/sklearn/postprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin):
be used as the input to this meta-estimator not the other way around.
Attributes:
estimator_: Cloned ``estimator``.
postprocessor_: Cloned ``postprocessor``.
estimator_: Fitted estimator.
postprocessor_: Fitted postprocessor.
use_proba_ (bool): Determined depending on the postprocessor type if
`use_proba` is None.
"""
Expand All @@ -49,7 +49,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(),
**options: Keyword options passed through to
:func:`~sklearn.model_selection.train_test_split`.
Note: 'train_size' and 'test_size' will be ignored in favor of
``val_size``.
'val_size'.
"""
self.estimator = estimator
self.postprocessor = postprocessor
Expand All @@ -70,14 +70,14 @@ def fit(self, X, y, sample_weight=None, **fit_params):
Args:
X (array-like): Training samples.
y (array-like): Training labels.
y (pandas.Series): Training labels.
sample_weight (array-like, optional): Sample weights.
**fit_params: Parameters passed to the post-processor ``fit``
**fit_params: Parameters passed to the post-processor ``fit()``
method. Note: these do not need to be prefixed with ``__``
notation.
Returns:
PostProcessingMeta: self.
self
"""
self.use_proba_ = (self.use_proba if self.use_proba is not None else
isinstance(self.postprocessor, CalibratedEqualizedOdds))
Expand Down Expand Up @@ -115,34 +115,34 @@ def fit(self, X, y, sample_weight=None, **fit_params):
def predict(self, X):
"""Predict class labels for the given samples.
First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.use_proba_`` is ``True``) then returns the post-processed output
from those predictions.
Args:
X (array-like): Test samples.
X (pandas.DataFrame): Test samples.
Returns:
numpy.ndarray: Predicted class label per sample.
"""
y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.Series(y_pred, index=X.index)
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict(y_pred)

@if_delegate_has_method('postprocessor_')
def predict_proba(self, X):
"""Probability estimates.
First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.use_proba_`` is ``True``) then returns the post-processed output
from those predictions.
The returned estimates for all classes are ordered by the label of
classes.
Args:
X (array-like): Test samples.
X (pandas.DataFrame): Test samples.
Returns:
numpy.ndarray: Returns the probability of the sample for each class
Expand All @@ -151,22 +151,22 @@ def predict_proba(self, X):
"""
y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.Series(y_pred, index=X.index)
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict_proba(y_pred)

@if_delegate_has_method('postprocessor_')
def predict_log_proba(self, X):
"""Log of probability estimates.
First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.use_proba_`` is ``True``) then returns the post-processed output
from those predictions.
The returned estimates for all classes are ordered by the label of
classes.
Args:
X (array-like): Test samples.
X (pandas.DataFrame): Test samples.
Returns:
array: Returns the log-probability of the sample for each class in
Expand All @@ -175,29 +175,29 @@ def predict_log_proba(self, X):
"""
y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.Series(y_pred, index=X.index)
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict_log_proba(y_pred)

@if_delegate_has_method('postprocessor_')
def score(self, X, y, sample_weight=None):
"""Returns the output of the post-processor's score function on the
given test data and labels.
First, runs ``self.estimator_.predict`` (or ``predict_proba`` if
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.use_proba_`` is ``True``) then gets the post-processed output
from those predictions and scores it.
Args:
X (array-like): Test samples.
y (array-like): True labels for ``X``.
X (pandas.DataFrame): Test samples.
y (array-like): True labels for X.
sample_weight (array-like, optional): Sample weights.
Returns:
float: Score value.
"""
y_pred = (self.estimator_.predict(X) if not self.use_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.Series(y_pred, index=X.index)
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight)


Expand Down
46 changes: 23 additions & 23 deletions aif360/sklearn/postprocessing/calibrated_equalized_odds.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from aif360.sklearn.metrics import difference, base_rate
from aif360.sklearn.metrics import generalized_fnr, generalized_fpr
from aif360.sklearn.utils import check_groups
from aif360.sklearn.utils import check_inputs, check_groups


class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin):
Expand All @@ -16,9 +16,9 @@ class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin):
change output labels with an equalized odds objective [#pleiss17]_.
Note:
This breaks the scikit-learn API by requiring fit params ``y_true``,
``y_pred``, and ``pos_label`` and predict param ``y_pred``. See
:class:`PostProcessingMeta` for a workaround.
This breaks the scikit-learn API by requiring fit params y_true, y_pred,
and pos_label and predict param y_pred. See :class:`PostProcessingMeta`
for a workaround.
References:
.. [#pleiss17] `G. Pleiss, M. Raghavan, F. Wu, J. Kleinberg, and
Expand Down Expand Up @@ -85,17 +85,20 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None):
Args:
y_pred (array-like): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent.
y_true (array-like): Ground-truth (correct) target values.
y_true (pandas.Series): Ground-truth (correct) target values.
labels (list, optional): The ordered set of labels values. Must
match the order of columns in ``y_pred`` if provided. By
default, all labels in ``y_true`` are used in sorted order.
match the order of columns in y_pred if provided. By default,
all labels in y_true are used in sorted order.
pos_label (scalar, optional): The label of the positive class.
sample_weight (array-like, optional): Sample weights.
Returns:
CalibratedEqualizedOdds: self.
self
"""
groups, self.prot_attr_ = check_groups(y_true, self.prot_attr)
y_pred, y_true, sample_weight = check_inputs(y_pred, y_true,
sample_weight)
groups, self.prot_attr_ = check_groups(y_true, self.prot_attr,
ensure_binary=True)
self.classes_ = labels if labels is not None else np.unique(y_true)
self.groups_ = np.unique(groups)
self.pos_label_ = pos_label
Expand All @@ -107,20 +110,14 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None):
raise ValueError('pos_label={} is not in the set of labels. The '
'valid values are:\n{}'.format(pos_label, self.classes_))

if len(self.groups_) != 2:
raise ValueError('prot_attr={}\nyielded {} groups:\n{}\nbut this '
'algorithm requires a binary division of the data.'.format(
self.prot_attr_, len(self.groups_), self.groups_))

y_pred = y_pred[:, np.nonzero(self.classes_ == self.pos_label_)[0][0]]

# local function to return corresponding args for metric evaluation
def _args(grp_idx, triv=False):
idx = (groups == self.groups_[grp_idx])
pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else
y_pred)
return [y_true[idx], pred[idx], pos_label,
sample_weight[idx] if sample_weight is not None else None]
return [y_true[idx], pred[idx], pos_label, sample_weight[idx]]

self.base_rates_ = [base_rate(*_args(i)) for i in range(2)]

Expand All @@ -138,8 +135,9 @@ def predict_proba(self, y_pred):
classes.
Args:
y_pred (array-like): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent.
y_pred (pandas.DataFrame): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent. Note: must
include protected attributes in the index.
Returns:
numpy.ndarray: Returns the probability of the sample for each class
Expand All @@ -156,7 +154,7 @@ def predict_proba(self, y_pred):
np.unique(groups), self.groups_))

pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0]
y_pred = y_pred[:, pos_idx]
y_pred = y_pred.iloc[:, pos_idx]

yt = np.empty_like(y_pred)
for grp_idx in range(2):
Expand All @@ -172,8 +170,9 @@ def predict(self, y_pred):
"""Predict class labels for the given scores.
Args:
y_pred (array-like): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent.
y_pred (pandas.DataFrame): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent. Note: must
include protected attributes in the index.
Returns:
numpy.ndarray: Predicted class label per sample.
Expand All @@ -185,8 +184,9 @@ def score(self, y_pred, y_true, sample_weight=None):
"""Score the predictions according to the cost constraint specified.
Args:
y_pred (array-like): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent.
y_pred (pandas.DataFrame): Probability estimates of the targets as
returned by a ``predict_proba()`` call or equivalent. Note: must
include protected attributes in the index.
y_true (array-like): Ground-truth (correct) target values.
sample_weight (array-like, optional): Sample weights.
Expand Down
46 changes: 30 additions & 16 deletions tests/sklearn/test_calibrated_equalized_odds.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss',
'hours-per-week'], features_to_drop=[])

def test_calib_eq_odds_sex():
def test_calib_eq_odds_sex_weighted():
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X)
adult_pred = adult.copy()
Expand All @@ -27,31 +27,45 @@ def test_calib_eq_odds_sex():
assert np.isclose(orig_cal_eq_odds.priv_mix_rate, cal_eq_odds.mix_rates_[1])
assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0])

def test_split():
adult_est, adult_post = adult.split([0.75], shuffle=False)
X_est, X_post, y_est, y_post = train_test_split(X, y, shuffle=False)
def test_postprocessingmeta_fnr():
adult_train, adult_test = adult.split([0.9], shuffle=False)
X_tr, X_te, y_tr, _, sw_tr, _ = train_test_split(X, y, sample_weight,
train_size=0.9, shuffle=False)

assert np.all(adult_est.features == X_est)
assert np.all(adult_est.labels.ravel() == y_est)
assert np.all(adult_post.features == X_post)
assert np.all(adult_post.labels.ravel() == y_post)
assert np.all(adult_train.features == X_tr)
assert np.all(adult_test.features == X_te)
assert np.all(adult_train.labels.ravel() == y_tr)

adult_est, adult_post = adult_train.split([0.75], shuffle=False)

def test_postprocessingmeta():
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(adult_est.features, adult_est.labels.ravel(),
sample_weight=adult_est.instance_weights)
probas_pred = logreg.predict_proba(adult_post.features)[:, 1]

adult_est, adult_post = adult.split([0.75], shuffle=False)
logreg.fit(adult_est.features, adult_est.labels.ravel())
y_pred = logreg.predict_proba(adult_post.features)[:, 1]
adult_pred = adult_post.copy()
adult_pred.scores = y_pred
adult_pred.scores = probas_pred

orig_cal_eq_odds = CalibratedEqOddsPostprocessing(
unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}])
unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}],
cost_constraint='fnr', seed=0)
orig_cal_eq_odds.fit(adult_post, adult_pred)

cal_eq_odds = PostProcessingMeta(estimator=logreg,
postprocessor=CalibratedEqualizedOdds('sex'), shuffle=False)
cal_eq_odds.fit(X, y, sample_weight=sample_weight)
postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', random_state=0),
shuffle=False)
cal_eq_odds.fit(X_tr, y_tr, sample_weight=sw_tr)

assert np.allclose(logreg.coef_, cal_eq_odds.estimator_.coef_)

assert np.allclose([orig_cal_eq_odds.unpriv_mix_rate,
orig_cal_eq_odds.priv_mix_rate],
cal_eq_odds.postprocessor_.mix_rates_)

adult_test_pred = adult_test.copy()
adult_test_pred.scores = logreg.predict_proba(adult_test.features)[:, 1]
adult_test_pred = orig_cal_eq_odds.predict(adult_test_pred)

y_test_pred = cal_eq_odds.predict_proba(X_te)

assert np.allclose(adult_test_pred.scores, y_test_pred[:, 1])

0 comments on commit 4a84e49

Please sign in to comment.