From e6a20bdef21a8420672f9d564b82e16faebb64ba Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 16 Jan 2022 17:02:59 -0800 Subject: [PATCH] API: Allow other na values in StringArray Constructor (#45168) --- asv_bench/benchmarks/strings.py | 17 ++++++++++ doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/lib.pyi | 3 ++ pandas/_libs/lib.pyx | 38 +++++++++++++++++++--- pandas/core/arrays/string_.py | 19 +++++++++-- pandas/tests/arrays/string_/test_string.py | 13 +++++--- pandas/tests/dtypes/test_inference.py | 20 +++++++++--- 7 files changed, 96 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 32fbf4e6c7de3..85487f5d531a3 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,10 +3,12 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, Series, ) +from pandas.arrays import StringArray from .pandas_vb_common import tm @@ -285,3 +287,18 @@ class Iter(Dtypes): def time_iter(self, dtype): for i in self.s: pass + + +class StringArrayConstruction: + def setup(self): + self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) + + def time_string_array_construction(self): + StringArray(self.series_arr) + + def time_string_array_with_nan_construction(self): + StringArray(self.series_arr_nan) + + def peakmem_stringarray_construction(self): + StringArray(self.series_arr) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 290f2e0ae08b6..a4ac322a030ca 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -30,6 +30,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) - Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`) - diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index a7ebd9d0c77ad..6a1519c827c7a 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -161,6 +161,9 @@ def astype_intsafe( arr: npt.NDArray[np.object_], new_dtype: np.dtype, ) -> np.ndarray: ... +def convert_nans_to_NA( + arr: npt.NDArray[np.object_], +) -> npt.NDArray[np.object_]: ... def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ... # TODO: can we be more specific about rows? diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ee317814bf79b..9a82e89481b45 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -669,6 +669,40 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: return result +ctypedef fused ndarr_object: + ndarray[object, ndim=1] + ndarray[object, ndim=2] + +# TODO: get rid of this in StringArray and modify +# and go through ensure_string_array instead +@cython.wraparound(False) +@cython.boundscheck(False) +def convert_nans_to_NA(ndarr_object arr) -> ndarray: + """ + Helper for StringArray that converts null values that + are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements + have already been validated as null. + """ + cdef: + Py_ssize_t i, m, n + object val + ndarr_object result + result = np.asarray(arr, dtype="object") + if arr.ndim == 2: + m, n = arr.shape[0], arr.shape[1] + for i in range(m): + for j in range(n): + val = arr[i, j] + if not isinstance(val, str): + result[i, j] = C_NA + else: + n = len(arr) + for i in range(n): + val = arr[i] + if not isinstance(val, str): + result[i] = C_NA + return result + @cython.wraparound(False) @cython.boundscheck(False) @@ -1880,10 +1914,6 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - cdef bint is_valid_null(self, object value) except -1: - # We deliberately exclude None / NaN here since StringArray uses NA - return value is C_NA - cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 919b882f22ecb..b79e915fa6c94 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings or :attr:`pandas.NA`. + where the elements are Python strings + or nan-likes (``None``, ``np.nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. + .. versionchanged:: 1.5.0 + + StringArray now accepts array-likes containing + nan-likes(``None``, ``np.nan``) for the ``values`` parameter + in addition to strings and :attr:`pandas.NA` + copy : bool, default False Whether to copy the array of data. @@ -310,11 +317,11 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) - if not isinstance(values, type(self)): - self._validate() def _validate(self): """Validate that we only store NA or strings.""" @@ -325,6 +332,12 @@ def _validate(self): "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) + # Check to see if need to convert Na values to pd.NA + if self._ndarray.ndim > 2: + # Ravel if ndims > 2 b/c no cythonized version available + lib.convert_nans_to_NA(self._ndarray.ravel("K")) + else: + lib.convert_nans_to_NA(self._ndarray) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 22fe7bb0de949..0919d57f9e612 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -267,15 +267,20 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) - - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) + cls(np.array(["a", np.datetime64("nat")], dtype=object)) with pytest.raises(ValueError, match=msg): cls(np.array(["a", pd.NaT], dtype=object)) +@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) + + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray and copy is False: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 7953d650636be..9ae5b42161b73 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1534,7 +1534,9 @@ def test_is_numeric_array(self): assert not lib.is_integer_array(np.array([1, 2.0])) def test_is_string_array(self): - + # We should only be accepting pd.NA, np.nan, + # other floating point nans e.g. float('nan')] + # when skipna is True. assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=False @@ -1542,11 +1544,21 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=True ) - # NaN is not valid for string array, just NA - assert not lib.is_string_array( + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) + assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) - + assert not lib.is_string_array( + np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=False + ) + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self):