Skip to content

Commit

Permalink
ENH Support unknown_value=np.nan in OrdinalEncoder (scikit-learn#18406)
Browse files Browse the repository at this point in the history
  • Loading branch information
NicolasHug authored Sep 23, 2020
1 parent f3b64db commit 4aada4e
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 16 deletions.
3 changes: 2 additions & 1 deletion doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,8 @@ Changelog
``use_encoded_value`` option, along with a new ``unknown_value`` parameter,
to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during
transform and set the encoded value of the unknown categories.
:pr:`17406` by :user:`Felix Wick <FelixWick>`.
:pr:`17406` by :user:`Felix Wick <FelixWick>` and :pr:`18406` by
`Nicolas Hug`_.

- |Feature| Add ``clip`` parameter to :class:`preprocessing.MinMaxScaler`,
which clips the transformed values of test data to ``feature_range``.
Expand Down
28 changes: 19 additions & 9 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numbers

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils import check_array, is_scalar_nan
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args

Expand Down Expand Up @@ -631,11 +631,12 @@ class OrdinalEncoder(_BaseEncoder):
.. versionadded:: 0.24
unknown_value : int, default=None
unknown_value : int or np.nan, default=None
When the parameter handle_unknown is set to 'use_encoded_value', this
parameter is required and will set the encoded value of unknown
categories. It has to be distinct from the values used to encode any of
the categories in `fit`.
the categories in `fit`. If set to np.nan, the `dtype` parameter must
be a float dtype.
.. versionadded:: 0.24
Expand Down Expand Up @@ -699,13 +700,21 @@ def fit(self, X, y=None):
self
"""
if self.handle_unknown == 'use_encoded_value':
if not isinstance(self.unknown_value, numbers.Integral):
raise TypeError(f"unknown_value should be an integer when "
f"`handle_unknown is 'use_encoded_value'`, "
if is_scalar_nan(self.unknown_value):
if np.dtype(self.dtype).kind != 'f':
raise ValueError(
f"When unknown_value is np.nan, the dtype "
"parameter should be "
f"a float dtype. Got {self.dtype}."
)
elif not isinstance(self.unknown_value, numbers.Integral):
raise TypeError(f"unknown_value should be an integer or "
f"np.nan when "
f"handle_unknown is 'use_encoded_value', "
f"got {self.unknown_value}.")
elif self.unknown_value is not None:
raise TypeError(f"unknown_value should only be set when "
f"`handle_unknown is 'use_encoded_value'`, "
f"handle_unknown is 'use_encoded_value', "
f"got {self.unknown_value}.")

self._fit(X)
Expand Down Expand Up @@ -735,11 +744,12 @@ def transform(self, X):
Transformed input.
"""
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
X_trans = X_int.astype(self.dtype, copy=False)

# create separate category for unknown values
if self.handle_unknown == 'use_encoded_value':
X_int[~X_mask] = self.unknown_value
return X_int.astype(self.dtype, copy=False)
X_trans[~X_mask] = self.unknown_value
return X_trans

def inverse_transform(self, X):
"""
Expand Down
36 changes: 30 additions & 6 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,21 +589,21 @@ def test_ordinal_encoder_handle_unknowns_raise():
X = np.array([['a', 'x'], ['b', 'y']], dtype=object)

enc = OrdinalEncoder(handle_unknown='use_encoded_value')
msg = ("unknown_value should be an integer when `handle_unknown is "
"'use_encoded_value'`, got None.")
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got None.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(unknown_value=-2)
msg = ("unknown_value should only be set when `handle_unknown is "
"'use_encoded_value'`, got -2.")
msg = ("unknown_value should only be set when handle_unknown is "
"'use_encoded_value', got -2.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value='bla')
msg = ("unknown_value should be an integer when `handle_unknown is "
"'use_encoded_value'`, got bla.")
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got bla.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

Expand All @@ -614,6 +614,30 @@ def test_ordinal_encoder_handle_unknowns_raise():
enc.fit(X)


def test_ordinal_encoder_handle_unknowns_nan():
    # unknown_value=np.nan must be accepted and used as the encoding of
    # categories that were not present in the data passed to fit.
    training_data = np.array([[1], [2], [3]])
    encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', unknown_value=np.nan
    )
    encoder.fit(training_data)

    # The value 4 is absent from training_data, so it is the unknown one.
    encoded = encoder.transform([[1], [2], [4]])
    expected = [[0], [1], [np.nan]]
    assert_array_equal(encoded, expected)


def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
    # fit must raise a ValueError when unknown_value=np.nan is combined
    # with a dtype that is not a float dtype (here: int).
    training_data = np.array([[1], [2], [3]])
    encoder = OrdinalEncoder(
        dtype=int, handle_unknown='use_encoded_value', unknown_value=np.nan
    )

    expected_msg = "dtype parameter should be a float dtype"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(training_data)


def test_ordinal_encoder_raise_categories_shape():

X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
Expand Down

0 comments on commit 4aada4e

Please sign in to comment.