Skip to content

Commit

Permalink
API: Allow other na values in StringArray Constructor (pandas-dev#45168)
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 authored Jan 17, 2022
1 parent 23f12a1 commit e6a20bd
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 15 deletions.
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import numpy as np

from pandas import (
NA,
Categorical,
DataFrame,
Series,
)
from pandas.arrays import StringArray

from .pandas_vb_common import tm

Expand Down Expand Up @@ -285,3 +287,18 @@ class Iter(Dtypes):
def time_iter(self, dtype):
for i in self.s:
pass


class StringArrayConstruction:
def setup(self):
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])

def time_string_array_construction(self):
StringArray(self.series_arr)

def time_string_array_with_nan_construction(self):
StringArray(self.series_arr_nan)

def peakmem_stringarray_construction(self):
StringArray(self.series_arr)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ enhancement2

Other enhancements
^^^^^^^^^^^^^^^^^^
- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
- Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
-

Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ def astype_intsafe(
arr: npt.NDArray[np.object_],
new_dtype: np.dtype,
) -> np.ndarray: ...
def convert_nans_to_NA(
arr: npt.NDArray[np.object_],
) -> npt.NDArray[np.object_]: ...
def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...

# TODO: can we be more specific about rows?
Expand Down
38 changes: 34 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,40 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:

return result

ctypedef fused ndarr_object:
ndarray[object, ndim=1]
ndarray[object, ndim=2]

# TODO: get rid of this in StringArray and modify
# and go through ensure_string_array instead
@cython.wraparound(False)
@cython.boundscheck(False)
def convert_nans_to_NA(ndarr_object arr) -> ndarray:
"""
Helper for StringArray that converts null values that
are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements
have already been validated as null.
"""
cdef:
Py_ssize_t i, m, n
object val
ndarr_object result
result = np.asarray(arr, dtype="object")
if arr.ndim == 2:
m, n = arr.shape[0], arr.shape[1]
for i in range(m):
for j in range(n):
val = arr[i, j]
if not isinstance(val, str):
result[i, j] = <object>C_NA
else:
n = len(arr)
for i in range(n):
val = arr[i]
if not isinstance(val, str):
result[i] = <object>C_NA
return result


@cython.wraparound(False)
@cython.boundscheck(False)
Expand Down Expand Up @@ -1880,10 +1914,6 @@ cdef class StringValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA


cpdef bint is_string_array(ndarray values, bint skipna=False):
cdef:
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
.. warning::
Currently, this expects an object-dtype ndarray
where the elements are Python strings or :attr:`pandas.NA`.
where the elements are Python strings
or nan-likes (``None``, ``np.nan``, ``NA``).
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.
.. versionchanged:: 1.5.0
StringArray now accepts array-likes containing
nan-likes(``None``, ``np.nan``) for the ``values`` parameter
in addition to strings and :attr:`pandas.NA`
copy : bool, default False
Whether to copy the array of data.
Expand Down Expand Up @@ -310,11 +317,11 @@ def __init__(self, values, copy=False):
values = extract_array(values)

super().__init__(values, copy=copy)
if not isinstance(values, type(self)):
self._validate()
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
if not isinstance(values, type(self)):
self._validate()

def _validate(self):
"""Validate that we only store NA or strings."""
Expand All @@ -325,6 +332,12 @@ def _validate(self):
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
# Check to see if need to convert Na values to pd.NA
if self._ndarray.ndim > 2:
# Ravel if ndims > 2 b/c no cythonized version available
lib.convert_nans_to_NA(self._ndarray.ravel("K"))
else:
lib.convert_nans_to_NA(self._ndarray)

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,15 +267,20 @@ def test_constructor_raises(cls):
cls(np.array([]))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", np.nan], dtype=object))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", None], dtype=object))
cls(np.array(["a", np.datetime64("nat")], dtype=object))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", pd.NaT], dtype=object))


@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
def test_constructor_nan_like(na):
expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
tm.assert_extension_array_equal(
pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
)


@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy, cls, request):
if cls is ArrowStringArray and copy is False:
Expand Down
20 changes: 16 additions & 4 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1534,19 +1534,31 @@ def test_is_numeric_array(self):
assert not lib.is_integer_array(np.array([1, 2.0]))

def test_is_string_array(self):

# We should only be accepting pd.NA, np.nan,
# other floating point nans e.g. float('nan')]
# when skipna is True.
assert lib.is_string_array(np.array(["foo", "bar"]))
assert not lib.is_string_array(
np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
)
assert lib.is_string_array(
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
)
# NaN is not valid for string array, just NA
assert not lib.is_string_array(
assert lib.is_string_array(
np.array(["foo", "bar", None], dtype=object), skipna=True
)
assert lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=True
)

assert not lib.is_string_array(
np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True
)
assert not lib.is_string_array(
np.array(["foo", "bar", None], dtype=object), skipna=False
)
assert not lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=False
)
assert not lib.is_string_array(np.array([1, 2]))

def test_to_object_array_tuples(self):
Expand Down

0 comments on commit e6a20bd

Please sign in to comment.