API: Allow other na values in StringArray Constructor (pandas-dev#45168)

adana315 · Jan 17, 2022 · e6a20bd · e6a20bd
1 parent 23f12a1
commit e6a20bd
Show file tree

Hide file tree

Showing 7 changed files with 96 additions and 15 deletions.
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -3,10 +3,12 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Series,
 )
+from pandas.arrays import StringArray
 
 from .pandas_vb_common import tm
 
@@ -285,3 +287,18 @@ class Iter(Dtypes):
     def time_iter(self, dtype):
         for i in self.s:
             pass
+
+
+class StringArrayConstruction:
+    def setup(self):
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
+
+    def time_string_array_construction(self):
+        StringArray(self.series_arr)
+
+    def time_string_array_with_nan_construction(self):
+        StringArray(self.series_arr_nan)
+
+    def peakmem_stringarray_construction(self):
+        StringArray(self.series_arr)
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -30,6 +30,7 @@ enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
+- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
 - Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
 -
 

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -161,6 +161,9 @@ def astype_intsafe(
     arr: npt.NDArray[np.object_],
     new_dtype: np.dtype,
 ) -> np.ndarray: ...
+def convert_nans_to_NA(
+    arr: npt.NDArray[np.object_],
+) -> npt.NDArray[np.object_]: ...
 def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...
 
 # TODO: can we be more specific about rows?

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -669,6 +669,40 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
 
     return result
 
+ctypedef fused ndarr_object:
+    ndarray[object, ndim=1]
+    ndarray[object, ndim=2]
+
+# TODO: get rid of this in StringArray and modify
+#  and go through ensure_string_array instead
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def convert_nans_to_NA(ndarr_object arr) -> ndarray:
+    """
+    Helper for StringArray that converts null values that
+    are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements
+    have already been validated as null.
+    """
+    cdef:
+        Py_ssize_t i, m, n
+        object val
+        ndarr_object result
+    result = np.asarray(arr, dtype="object")
+    if arr.ndim == 2:
+        m, n = arr.shape[0], arr.shape[1]
+        for i in range(m):
+            for j in range(n):
+                val = arr[i, j]
+                if not isinstance(val, str):
+                    result[i, j] = <object>C_NA
+    else:
+        n = len(arr)
+        for i in range(n):
+            val = arr[i]
+            if not isinstance(val, str):
+                result[i] = <object>C_NA
+    return result
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -1880,10 +1914,6 @@ cdef class StringValidator(Validator):
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.str_)
 
-    cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
-
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
     cdef:

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes (``None``, ``np.nan``, ``NA``).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.5.0
+
+           StringArray now accepts array-likes containing
+           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
+           in addition to strings and :attr:`pandas.NA`
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -310,11 +317,11 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
-        if not isinstance(values, type(self)):
-            self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
@@ -325,6 +332,12 @@ def _validate(self):
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        # Check to see if need to convert Na values to pd.NA
+        if self._ndarray.ndim > 2:
+            # Ravel if ndims > 2 b/c no cythonized version available
+            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
+        else:
+            lib.convert_nans_to_NA(self._ndarray)
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -267,15 +267,20 @@ def test_constructor_raises(cls):
         cls(np.array([]))
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
-
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
+        cls(np.array(["a", np.datetime64("nat")], dtype=object))
 
     with pytest.raises(ValueError, match=msg):
         cls(np.array(["a", pd.NaT], dtype=object))
 
 
+@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
+def test_constructor_nan_like(na):
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+    tm.assert_extension_array_equal(
+        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    )
+
+
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy, cls, request):
     if cls is ArrowStringArray and copy is False:

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -1534,19 +1534,31 @@ def test_is_numeric_array(self):
         assert not lib.is_integer_array(np.array([1, 2.0]))
 
     def test_is_string_array(self):
-
+        # We should only be accepting pd.NA, np.nan,
+        # other floating point nans e.g. float('nan')]
+        # when skipna is True.
         assert lib.is_string_array(np.array(["foo", "bar"]))
         assert not lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
         )
         assert lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
         )
-        # NaN is not valid for string array, just NA
-        assert not lib.is_string_array(
+        assert lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=True
+        )
+        assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
-
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=False
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=False
+        )
         assert not lib.is_string_array(np.array([1, 2]))
 
     def test_to_object_array_tuples(self):