ARROW-4350: [Python] Fix conversion from Python to Arrow with nested lists and NumPy dtype=object items

NumPy object array values weren't being iterated over in the case where the value type is `list<T>` instead of some primitive type like `int64`.

This code path appears to have never been properly tested, so when a user hit it, it didn't work.
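A minimal reproduction (the object-array construction mirrors the unit test added in this patch):

```python
import numpy as np
import pyarrow as pa

# An outer sequence whose item is an object-dtype NumPy array of
# int64 sub-arrays, i.e. a list<int64> value nested one level down
arr = np.empty(2, dtype=object)
arr[:] = [np.array([1, 2], dtype=np.int64),
          np.array([2, 3], dtype=np.int64)]

# Previously the items of `arr` were not unboxed; with this change
# the result is an Array of type list<list<int64>>
result = pa.array([arr])
assert result.type == pa.list_(pa.list_(pa.int64()))
```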

Author: Wes McKinney <[email protected]>

Closes apache#4609 from wesm/ARROW-4350 and squashes the following commits:

6d7883e <Wes McKinney> Code review feedback
d2f3831 <Wes McKinney> Add xfailing unit test for ARROW-5645
b2b3c50 <Wes McKinney> Remove unneeded namespace
a74e5cc <Wes McKinney> Actually unbox NPY_OBJECT arrays in ListConverter
279f681 <Wes McKinney> Expose infer_type function
wesm committed Jun 19, 2019
1 parent 726f90f commit c5d2fc5
Showing 8 changed files with 99 additions and 12 deletions.
5 changes: 5 additions & 0 deletions cpp/src/arrow/python/helpers.cc
@@ -373,6 +373,11 @@ Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) {
   return Status::OK();
 }
 
+void DebugPrint(PyObject* obj) {
+  std::string repr = PyObject_StdStringRepr(obj);
+  PySys_WriteStderr("%s\n", repr.c_str());
+}
+
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow
3 changes: 3 additions & 0 deletions cpp/src/arrow/python/helpers.h
@@ -129,6 +129,9 @@ Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
 ARROW_PYTHON_EXPORT
 Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
 
+// \brief Print Python object __repr__
+void DebugPrint(PyObject* obj);
+
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow
18 changes: 12 additions & 6 deletions cpp/src/arrow/python/python_to_arrow.cc
@@ -222,10 +222,9 @@ class TypedConverter : public SeqConverter {
     RETURN_NOT_OK(this->typed_builder_->Reserve(size));
     // Iterate over the items adding each one
     auto self = checked_cast<Derived*>(this);
-    return internal::VisitSequence(obj,
-                                   [self](PyObject* item, bool* keep_going /* unused */) {
-                                     return self->AppendSingle(item);
-                                   });
+    return internal::VisitSequence(obj, [self](PyObject* item, bool* /* unused */) {
+      return self->AppendSingle(item);
+    });
   }
 
   Status AppendMultipleMasked(PyObject* obj, PyObject* mask, int64_t size) override {
@@ -234,7 +233,7 @@ class TypedConverter : public SeqConverter {
     // Iterate over the items adding each one
     auto self = checked_cast<Derived*>(this);
     return internal::VisitSequenceMasked(
-        obj, mask, [self](PyObject* item, bool is_masked, bool* keep_going /* unused */) {
+        obj, mask, [self](PyObject* item, bool is_masked, bool* /* unused */) {
           if (is_masked) {
             return self->AppendNull();
           } else {
@@ -699,7 +698,14 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) {
     LIST_SLOW_CASE(FIXED_SIZE_BINARY)
     LIST_SLOW_CASE(STRING)
     case Type::LIST: {
-      return value_converter_->AppendSingleVirtual(obj);
+      if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) {
+        return Status::Invalid(
+            "Can only convert list types from NumPy object "
+            "array input");
+      }
+      return internal::VisitSequence(obj, [this](PyObject* item, bool*) {
+        return value_converter_->AppendSingleVirtual(item);
+      });
     }
     default: {
       return Status::TypeError("Unknown list item type: ", value_type_->ToString());
2 changes: 1 addition & 1 deletion python/pyarrow/__init__.py
@@ -66,7 +66,7 @@ def parse_git(root, **kwargs):
     schema,
     Array, Tensor,
     array, chunked_array, column,
-    from_numpy_dtype,
+    infer_type, from_numpy_dtype,
     NullArray,
     NumericArray, IntegerArray, FloatingPointArray,
     BooleanArray,
23 changes: 23 additions & 0 deletions python/pyarrow/array.pxi
@@ -218,6 +218,29 @@ def asarray(values, type=None):
         return array(values, type=type)
 
 
+def infer_type(values, from_pandas=False):
+    """
+    Attempt to infer Arrow data type that can hold the passed Python
+    sequence type in an Array object
+
+    Parameters
+    ----------
+    values : array-like
+    from_pandas : boolean, default False
+        Use pandas's NA/null sentinel values for type inference
+
+    Returns
+    -------
+    type : DataType
+    """
+    cdef:
+        shared_ptr[CDataType] out
+        c_bool use_pandas_sentinels = from_pandas
+
+    check_status(InferArrowType(values, use_pandas_sentinels, &out))
+    return pyarrow_wrap_data_type(out)
+
+
 def _normalize_slice(object arrow_obj, slice key):
     cdef:
         Py_ssize_t start, stop, step
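For reference, a short usage sketch of the newly exposed function (the scalar case assumes pyarrow's default int64 inference for Python ints; the nested case is taken from the new unit test below):

```python
import numpy as np
import pyarrow as pa

# Simple sequences: Python ints are inferred as int64 by default
assert pa.infer_type([1, 2, None]) == pa.int64()

# Nested ndarrays inside an object array, as exercised by
# test_nested_ndarray_in_object_array below
arr = np.empty(2, dtype=object)
arr[:] = [np.array([1, 2], dtype=np.int64),
          np.array([2, 3], dtype=np.int64)]
assert pa.infer_type([arr]) == pa.list_(pa.list_(pa.int64()))
```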
8 changes: 6 additions & 2 deletions python/pyarrow/includes/libarrow.pxd
@@ -1149,6 +1149,12 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
                  CDatum* out)
 
 
+cdef extern from "arrow/python/api.h" namespace "arrow::py":
+    # Requires GIL
+    CStatus InferArrowType(object obj, c_bool pandas_null_sentinels,
+                           shared_ptr[CDataType]* out_type)
+
+
 cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     shared_ptr[CDataType] GetPrimitiveType(Type type)
     shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
@@ -1234,8 +1240,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool use_threads
        c_bool deduplicate_objects
 
-cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
-
     cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject":
         shared_ptr[CRecordBatch] batch
         vector[shared_ptr[CTensor]] tensors
45 changes: 45 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
@@ -488,6 +488,51 @@ def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
                     [[1., 2.], [1., 2., 3.], [np.nan], None])
 
 
+def test_nested_ndarray_in_object_array():
+    # ARROW-4350
+    arr = np.empty(2, dtype=object)
+    arr[:] = [np.array([1, 2], dtype=np.int64),
+              np.array([2, 3], dtype=np.int64)]
+
+    arr2 = np.empty(2, dtype=object)
+    arr2[0] = [3, 4]
+    arr2[1] = [5, 6]
+
+    expected_type = pa.list_(pa.list_(pa.int64()))
+    assert pa.infer_type([arr]) == expected_type
+
+    result = pa.array([arr, arr2])
+    expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
+                        type=expected_type)
+
+    assert result.equals(expected)
+
+    # test case for len-1 arrays to ensure they are interpreted as
+    # sublists and not scalars
+    arr = np.empty(2, dtype=object)
+    arr[:] = [np.array([1]), np.array([2])]
+    result = pa.array([arr, arr])
+    assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
+
+
+@pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
+                           "not yet implemented"),
+                   raises=AssertionError)
+def test_multidimensional_ndarray_as_nested_list():
+    # TODO(wesm): see ARROW-5645
+    arr = np.array([[1, 2], [2, 3]], dtype=np.int64)
+    arr2 = np.array([[3, 4], [5, 6]], dtype=np.int64)
+
+    expected_type = pa.list_(pa.list_(pa.int64()))
+    assert pa.infer_type([arr]) == expected_type
+
+    result = pa.array([arr, arr2])
+    expected = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
+                        type=expected_type)
+
+    assert result.equals(expected)
+
+
 def test_array_ignore_nan_from_pandas():
     # See ARROW-4324, this reverts logic that was introduced in
     # ARROW-2240
7 changes: 4 additions & 3 deletions python/pyarrow/types.pxi
@@ -1566,17 +1566,18 @@ cpdef ListType list_(value_type):
     """
     cdef:
         DataType data_type
-        Field field
+        Field _field
        shared_ptr[CDataType] list_type
         ListType out = ListType.__new__(ListType)
 
     if isinstance(value_type, DataType):
-        list_type.reset(new CListType((<DataType> value_type).sp_type))
+        _field = field('item', value_type)
     elif isinstance(value_type, Field):
-        list_type.reset(new CListType((<Field> value_type).sp_field))
+        _field = value_type
     else:
         raise TypeError('List requires DataType or Field')
 
+    list_type.reset(new CListType(_field.sp_field))
     out.init(list_type)
     return out
 
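A quick illustration of the two argument kinds `pa.list_` accepts after this refactor (both paths now build the child field first and pass it to `CListType`; the equality assertion assumes the default child field is named 'item', per the `field('item', value_type)` call above):

```python
import pyarrow as pa

# Passing a DataType: a default child field named 'item' is created
t1 = pa.list_(pa.int64())

# Passing a Field: the child field is used as-is
t2 = pa.list_(pa.field('item', pa.int64()))

# Both should compare equal, since they carry the same child field
assert t1 == t2

# Anything else raises: TypeError('List requires DataType or Field')
```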
