Skip to content

Commit

Permalink
BUG: repr of Categorical does not distinguish int and str. (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli authored Jun 24, 2020
1 parent 314ac9a commit db48799
Show file tree
Hide file tree
Showing 15 changed files with 138 additions and 114 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,7 @@ Categorical
- Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
- Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
- :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
- Bug where the repr of :class:`Categorical` did not distinguish between ``int`` and ``str`` categories (:issue:`33676`)

Datetimelike
^^^^^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,8 +604,8 @@ def factorize(
>>> codes
array([0, 0, 1]...)
>>> uniques
[a, c]
Categories (3, object): [a, b, c]
['a', 'c']
Categories (3, object): ['a', 'b', 'c']
Notice that ``'b'`` is in ``uniques.categories``, despite not being
present in ``cat.values``.
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,14 +846,14 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"
--------
>>> cat = pd.Categorical(['a', 'b', 'c'])
>>> cat
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> cat.repeat(2)
[a, a, b, b, c, c]
Categories (3, object): [a, b, c]
['a', 'a', 'b', 'b', 'c', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> cat.repeat([1, 2, 3])
[a, b, b, c, c, c]
Categories (3, object): [a, b, c]
['a', 'b', 'b', 'c', 'c', 'c']
Categories (3, object): ['a', 'b', 'c']
"""

@Substitution(klass="ExtensionArray")
Expand Down
105 changes: 55 additions & 50 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from csv import QUOTE_NONNUMERIC
from functools import partial
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Type, Union, cast
Expand Down Expand Up @@ -275,17 +277,17 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject):
Categories (3, int64): [1, 2, 3]
>>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
[a, b, c, a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
Ordered `Categoricals` can be sorted according to the custom order
of the categories and can have a min and max value.
>>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
... categories=['c', 'b', 'a'])
>>> c
[a, b, c, a, b, c]
Categories (3, object): [c < b < a]
['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['c' < 'b' < 'a']
>>> c.min()
'c'
"""
Expand Down Expand Up @@ -598,8 +600,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
--------
>>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
>>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
[a, b, a, b]
Categories (2, object): [a < b]
['a', 'b', 'a', 'b']
Categories (2, object): ['a' < 'b']
"""
dtype = CategoricalDtype._from_values_or_dtype(
categories=categories, ordered=ordered, dtype=dtype
Expand Down Expand Up @@ -659,13 +661,13 @@ def _set_categories(self, categories, fastpath=False):
--------
>>> c = pd.Categorical(['a', 'b'])
>>> c
[a, b]
Categories (2, object): [a, b]
['a', 'b']
Categories (2, object): ['a', 'b']
>>> c._set_categories(pd.Index(['a', 'c']))
>>> c
[a, c]
Categories (2, object): [a, c]
['a', 'c']
Categories (2, object): ['a', 'c']
"""
if fastpath:
new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
Expand Down Expand Up @@ -885,14 +887,14 @@ def rename_categories(self, new_categories, inplace=False):
categories not in the dictionary are passed through
>>> c.rename_categories({'a': 'A', 'c': 'C'})
[A, A, b]
Categories (2, object): [A, b]
['A', 'A', 'b']
Categories (2, object): ['A', 'b']
You may also provide a callable to create the new categories
>>> c.rename_categories(lambda x: x.upper())
[A, A, B]
Categories (2, object): [A, B]
['A', 'A', 'B']
Categories (2, object): ['A', 'B']
"""
inplace = validate_bool_kwarg(inplace, "inplace")
cat = self if inplace else self.copy()
Expand Down Expand Up @@ -1128,22 +1130,22 @@ def map(self, mapper):
--------
>>> cat = pd.Categorical(['a', 'b', 'c'])
>>> cat
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> cat.map(lambda x: x.upper())
[A, B, C]
Categories (3, object): [A, B, C]
['A', 'B', 'C']
Categories (3, object): ['A', 'B', 'C']
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
[first, second, third]
Categories (3, object): [first, second, third]
['first', 'second', 'third']
Categories (3, object): ['first', 'second', 'third']
If the mapping is one-to-one the ordering of the categories is
preserved:
>>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
>>> cat
[a, b, c]
Categories (3, object): [a < b < c]
['a', 'b', 'c']
Categories (3, object): ['a' < 'b' < 'c']
>>> cat.map({'a': 3, 'b': 2, 'c': 1})
[3, 2, 1]
Categories (3, int64): [3 < 2 < 1]
Expand Down Expand Up @@ -1778,29 +1780,29 @@ def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T:
--------
>>> cat = pd.Categorical(['a', 'a', 'b'])
>>> cat
[a, a, b]
Categories (2, object): [a, b]
['a', 'a', 'b']
Categories (2, object): ['a', 'b']
Specify ``allow_fill=False`` to have negative indices mean indexing
from the right.
>>> cat.take([0, -1, -2], allow_fill=False)
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
values that should be filled with the `fill_value`, which is
``np.nan`` by default.
>>> cat.take([0, -1, -1], allow_fill=True)
[a, NaN, NaN]
Categories (2, object): [a, b]
['a', NaN, NaN]
Categories (2, object): ['a', 'b']
The fill value can be specified.
>>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
[a, a, a]
Categories (2, object): [a, b]
['a', 'a', 'a']
Categories (2, object): ['a', 'b']
Specifying a fill value that's not in ``self.categories``
will raise a ``ValueError``.
Expand Down Expand Up @@ -1872,13 +1874,16 @@ def _repr_categories(self):
)
from pandas.io.formats import format as fmt

format_array = partial(
fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
)
if len(self.categories) > max_categories:
num = max_categories // 2
head = fmt.format_array(self.categories[:num], None)
tail = fmt.format_array(self.categories[-num:], None)
head = format_array(self.categories[:num])
tail = format_array(self.categories[-num:])
category_strs = head + ["..."] + tail
else:
category_strs = fmt.format_array(self.categories, None)
category_strs = format_array(self.categories)

# Strip all leading spaces, which format_array adds for columns...
category_strs = [x.strip() for x in category_strs]
Expand Down Expand Up @@ -2051,8 +2056,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
--------
>>> c = pd.Categorical(list('aabca'))
>>> c
[a, a, b, c, a]
Categories (3, object): [a, b, c]
['a', 'a', 'b', 'c', 'a']
Categories (3, object): ['a', 'b', 'c']
>>> c.categories
Index(['a', 'b', 'c'], dtype='object')
>>> c.codes
Expand Down Expand Up @@ -2199,20 +2204,20 @@ def unique(self):
order of appearance.
>>> pd.Categorical(list("baabc")).unique()
[b, a, c]
Categories (3, object): [b, a, c]
['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']
>>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
[b, a, c]
Categories (3, object): [b, a, c]
['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']
An ordered Categorical preserves the category ordering.
>>> pd.Categorical(
... list("baabc"), categories=list("abc"), ordered=True
... ).unique()
[b, a, c]
Categories (3, object): [a < b < c]
['b', 'a', 'c']
Categories (3, object): ['a' < 'b' < 'c']
"""
# unlike np.unique, unique1d does not sort
unique_codes = unique1d(self.codes)
Expand Down Expand Up @@ -2465,7 +2470,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
>>> s.cat.categories
Index(['a', 'b', 'c'], dtype='object')
Expand All @@ -2478,7 +2483,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 a
5 a
dtype: category
Categories (3, object): [c, b, a]
Categories (3, object): ['c', 'b', 'a']
>>> s.cat.reorder_categories(list("cba"))
0 a
Expand All @@ -2488,7 +2493,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [c, b, a]
Categories (3, object): ['c', 'b', 'a']
>>> s.cat.add_categories(["d", "e"])
0 a
Expand All @@ -2498,7 +2503,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (5, object): [a, b, c, d, e]
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
>>> s.cat.remove_categories(["a", "c"])
0 NaN
Expand All @@ -2508,7 +2513,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 NaN
5 NaN
dtype: category
Categories (1, object): [b]
Categories (1, object): ['b']
>>> s1 = s.cat.add_categories(["d", "e"])
>>> s1.cat.remove_unused_categories()
Expand All @@ -2519,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
>>> s.cat.set_categories(list("abcde"))
0 a
Expand All @@ -2529,7 +2534,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (5, object): [a, b, c, d, e]
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
>>> s.cat.as_ordered()
0 a
Expand All @@ -2539,7 +2544,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a < b < c]
Categories (3, object): ['a' < 'b' < 'c']
>>> s.cat.as_unordered()
0 a
Expand All @@ -2549,7 +2554,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
"""

def __init__(self, data):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ def array(self) -> ExtensionArray:
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
>>> ser.array
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
"""
raise AbstractMethodError(self)

Expand Down Expand Up @@ -1481,8 +1481,8 @@ def factorize(self, sort=False, na_sentinel=-1):
... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
... )
>>> ser
[apple, bread, bread, cheese, milk]
Categories (4, object): [apple < bread < cheese < milk]
['apple', 'bread', 'bread', 'cheese', 'milk']
Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
>>> ser.searchsorted('bread')
1
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,15 +217,15 @@ def array(
You can use the string alias for `dtype`
>>> pd.array(['a', 'b', 'a'], dtype='category')
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
Or specify the actual dtype
>>> pd.array(['a', 'b', 'a'],
... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
[a, b, a]
Categories (3, object): [a < b < c]
['a', 'b', 'a']
Categories (3, object): ['a' < 'b' < 'c']
If pandas does not infer a dedicated extension type a
:class:`arrays.PandasArray` is returned.
Expand Down Expand Up @@ -357,8 +357,8 @@ def extract_array(obj, extract_numpy: bool = False):
Examples
--------
>>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
Other objects like lists, arrays, and DataFrames are just passed through.
Expand Down
Loading

0 comments on commit db48799

Please sign in to comment.