Skip to content

Commit 3ccbbfd

Browse files
authored Feb 25, 2021
PERF: make Categorical _ndarray, attribute, _codes property (pandas-dev#40033)
1 parent 6db33da commit 3ccbbfd

File tree

2 files changed

+15
-11
lines changed

2 files changed

+15
-11
lines changed
 

‎pandas/core/arrays/categorical.py

+14 -10
Original file line number | Diff line number | Diff line change
@@ -373,7 +373,7 @@ def __init__(
373373
# infer categories in a factorization step further below
374374

375375
if fastpath:
376-
self._codes = coerce_indexer_dtype(values, dtype.categories)
376+
self._ndarray = coerce_indexer_dtype(values, dtype.categories)
377377
self._dtype = self._dtype.update_dtype(dtype)
378378
return
379379

@@ -450,7 +450,7 @@ def __init__(
450450
codes = full_codes
451451

452452
self._dtype = self._dtype.update_dtype(dtype)
453-
self._codes = coerce_indexer_dtype(codes, dtype.categories)
453+
self._ndarray = coerce_indexer_dtype(codes, dtype.categories)
454454

455455
@property
456456
def dtype(self) -> CategoricalDtype:
@@ -923,7 +923,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
923923
codes = recode_for_categories(
924924
cat.codes, cat.categories, new_dtype.categories
925925
)
926-
cat._codes = codes
926+
cat._ndarray = codes
927927
cat._dtype = new_dtype
928928

929929
if not inplace:
@@ -1096,7 +1096,7 @@ def add_categories(self, new_categories, inplace=False):
10961096

10971097
cat = self if inplace else self.copy()
10981098
cat._dtype = new_dtype
1099-
cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
1099+
cat._ndarray = coerce_indexer_dtype(cat._ndarray, new_dtype.categories)
11001100
if not inplace:
11011101
return cat
11021102

@@ -1201,7 +1201,7 @@ def remove_unused_categories(self, inplace=no_default):
12011201
new_categories, ordered=self.ordered
12021202
)
12031203
cat._dtype = new_dtype
1204-
cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
1204+
cat._ndarray = coerce_indexer_dtype(inv, new_dtype.categories)
12051205

12061206
if not inplace:
12071207
return cat
@@ -1384,6 +1384,10 @@ def __setstate__(self, state):
13841384
if "_dtype" not in state:
13851385
state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])
13861386

1387+
if "_codes" in state and "_ndarray" not in state:
1388+
# backward compat, changed what is property vs attribute
1389+
state["_ndarray"] = state.pop("_codes")
1390+
13871391
for k, v in state.items():
13881392
setattr(self, k, v)
13891393

@@ -1785,11 +1789,11 @@ def fillna(self, value=None, method=None, limit=None):
17851789
# NDArrayBackedExtensionArray compat
17861790

17871791
@property
1788-
def _ndarray(self) -> np.ndarray:
1789-
return self._codes
1792+
def _codes(self) -> np.ndarray:
1793+
return self._ndarray
17901794

17911795
def _from_backing_data(self, arr: np.ndarray) -> Categorical:
1792-
return self._constructor(arr, dtype=self.dtype, fastpath=True)
1796+
return type(self)(arr, dtype=self.dtype, fastpath=True)
17931797

17941798
def _box_func(self, i: int):
17951799
if i == -1:
@@ -1800,7 +1804,7 @@ def _unbox_scalar(self, key) -> int:
18001804
# searchsorted is very performance sensitive. By converting codes
18011805
# to same dtype as self.codes, we get much faster performance.
18021806
code = self.categories.get_loc(key)
1803-
code = self._codes.dtype.type(code)
1807+
code = self._ndarray.dtype.type(code)
18041808
return code
18051809

18061810
# ------------------------------------------------------------------
@@ -2162,7 +2166,7 @@ def unique(self):
21622166
cat = self.copy()
21632167

21642168
# keep nan in codes
2165-
cat._codes = unique_codes
2169+
cat._ndarray = unique_codes
21662170

21672171
# exclude nan from indexer for categories
21682172
take_codes = unique_codes[unique_codes != -1]

‎pandas/tests/indexes/categorical/test_category.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -351,7 +351,7 @@ def test_engine_type(self, dtype, engine_type):
351351
# having 2**32 - 2**31 categories would be very memory-intensive,
352352
# so we cheat a bit with the dtype
353353
ci = CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1)
354-
ci.values._codes = ci.values._codes.astype("int64")
354+
ci.values._ndarray = ci.values._ndarray.astype("int64")
355355
assert np.issubdtype(ci.codes.dtype, dtype)
356356
assert isinstance(ci._engine, engine_type)
357357

0 commit comments

Comments
 (0)