Skip to content

Commit

Permalink
ARROW-6084: [Python] Support LargeList
Browse files Browse the repository at this point in the history
Closes apache#4979 from pitrou/ARROW-6084-py-large-list and squashes the following commits:

4266ea2 <Antoine Pitrou> ARROW-6084:  Support LargeList

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
  • Loading branch information
pitrou authored and wesm committed Aug 6, 2019
1 parent d9b0ef1 commit 2774cfb
Show file tree
Hide file tree
Showing 18 changed files with 356 additions and 29 deletions.
27 changes: 18 additions & 9 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -582,27 +582,30 @@ class StringConverter
// ----------------------------------------------------------------------
// Convert lists (NumPy arrays containing lists or ndarrays as values)

class ListConverter : public TypedConverter<ListType, ListConverter> {
template <typename TypeClass>
class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
public:
using BuilderType = typename TypeTraits<TypeClass>::BuilderType;

explicit ListConverter(bool from_pandas, bool strict_conversions)
: from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}

Status Init(ArrayBuilder* builder) {
builder_ = builder;
typed_builder_ = checked_cast<ListBuilder*>(builder);
this->builder_ = builder;
this->typed_builder_ = checked_cast<BuilderType*>(builder);

value_type_ = checked_cast<const ListType&>(*builder->type()).value_type();
value_type_ = checked_cast<const TypeClass&>(*builder->type()).value_type();
RETURN_NOT_OK(
GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_));
return value_converter_->Init(typed_builder_->value_builder());
return value_converter_->Init(this->typed_builder_->value_builder());
}

template <int NUMPY_TYPE, typename Type>
Status AppendNdarrayTypedItem(PyArrayObject* arr);
Status AppendNdarrayItem(PyObject* arr);

Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(typed_builder_->Append());
RETURN_NOT_OK(this->typed_builder_->Append());
if (PyArray_Check(obj)) {
return AppendNdarrayItem(obj);
}
Expand All @@ -625,8 +628,9 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
bool strict_conversions_;
};

template <typename TypeClass>
template <int NUMPY_TYPE, typename Type>
Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
using traits = internal::npy_traits<NUMPY_TYPE>;
using T = typename traits::value_type;
using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
Expand Down Expand Up @@ -673,7 +677,8 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
return value_converter_->AppendMultiple(obj, value_length); \
}

Status ListConverter::AppendNdarrayItem(PyObject* obj) {
template <typename TypeClass>
Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);

if (PyArray_NDIM(arr) != 1) {
Expand Down Expand Up @@ -914,7 +919,11 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
}
case Type::LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter(from_pandas, strict_conversions));
new ListConverter<ListType>(from_pandas, strict_conversions));
break;
case Type::LARGE_LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter<LargeListType>(from_pandas, strict_conversions));
break;
case Type::STRUCT:
*out = std::unique_ptr<SeqConverter>(
Expand Down
2 changes: 2 additions & 0 deletions docs/source/python/api/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ may expose data type-specific methods or properties.
Decimal128Array
DictionaryArray
ListArray
LargeListArray
StructArray
UnionArray

Expand Down Expand Up @@ -109,5 +110,6 @@ any of those classes directly.
DecimalValue
DictionaryValue
ListValue
LargeListValue
StructValue
UnionValue
2 changes: 2 additions & 0 deletions docs/source/python/api/datatypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ These should be used to create Arrow data types and schemas.
large_utf8
decimal128
list_
large_list
struct
dictionary
field
Expand Down Expand Up @@ -117,6 +118,7 @@ represents a given data type (such as ``int32``) or general category
is_float64
is_decimal
is_list
is_large_list
is_struct
is_union
is_nested
Expand Down
10 changes: 6 additions & 4 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ def parse_git(root, **kwargs):
binary, string, utf8,
large_binary, large_string, large_utf8,
decimal128,
list_, struct, union, dictionary, field,
list_, large_list, struct, union, dictionary, field,
type_for_alias,
DataType, DictionaryType, ListType, StructType,
DataType, DictionaryType, StructType,
ListType, LargeListType,
UnionType, TimestampType, Time32Type, Time64Type,
FixedSizeBinaryType, Decimal128Type,
BaseExtensionType, ExtensionType,
Expand All @@ -77,7 +78,7 @@ def parse_git(root, **kwargs):
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
ListArray, UnionArray,
ListArray, LargeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
FixedSizeBinaryArray,
Expand All @@ -89,7 +90,8 @@ def parse_git(root, **kwargs):
BooleanValue,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
HalfFloatValue, FloatValue, DoubleValue, ListValue,
HalfFloatValue, FloatValue, DoubleValue,
ListValue, LargeListValue,
BinaryValue, StringValue,
LargeBinaryValue, LargeStringValue,
FixedSizeBinaryValue,
Expand Down
47 changes: 47 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,52 @@ cdef class ListArray(Array):
return pyarrow_wrap_array(arr.values())


cdef class LargeListArray(Array):
    """
    Concrete class for Arrow arrays of a large list data type
    (like ListArray, but 64-bit offsets).
    """

    @staticmethod
    def from_arrays(offsets, values, MemoryPool pool=None):
        """
        Construct LargeListArray from arrays of int64 offsets and values

        Parameters
        ----------
        offsets : Array (int64 type)
            Converted with ``asarray(offsets, type='int64')``.
        values : Array (any type)
            Converted with ``asarray(values)``.
        pool : MemoryPool, optional
            Unboxed with ``maybe_unbox_memory_pool`` and handed to the
            C++ ``LargeListArray::FromArrays`` call.

        Returns
        -------
        list_array : LargeListArray
        """
        cdef:
            Array _offsets, _values
            shared_ptr[CArray] out
        cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)

        # Coerce both inputs to Arrow arrays before dropping the GIL;
        # the offsets are explicitly requested as int64.
        _offsets = asarray(offsets, type='int64')
        _values = asarray(values)

        with nogil:
            check_status(CLargeListArray.FromArrays(_offsets.ap[0],
                                                    _values.ap[0],
                                                    cpool, &out))
        return pyarrow_wrap_array(out)

    def flatten(self):
        """
        Unnest this LargeListArray by one level

        Returns
        -------
        result : Array
        """
        cdef CLargeListArray* arr = <CLargeListArray*> self.ap
        return pyarrow_wrap_array(arr.values())


cdef class UnionArray(Array):
"""
Concrete class for Arrow arrays of a Union data type.
Expand Down Expand Up @@ -1511,6 +1557,7 @@ cdef dict _array_classes = {
_Type_FLOAT: FloatArray,
_Type_DOUBLE: DoubleArray,
_Type_LIST: ListArray,
_Type_LARGE_LIST: LargeListArray,
_Type_UNION: UnionArray,
_Type_BINARY: BinaryArray,
_Type_STRING: StringArray,
Expand Down
18 changes: 18 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"

_Type_LIST" arrow::Type::LIST"
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
_Type_STRUCT" arrow::Type::STRUCT"
_Type_UNION" arrow::Type::UNION"
_Type_DICTIONARY" arrow::Type::DICTIONARY"
Expand Down Expand Up @@ -252,6 +253,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CDataType] value_type()
shared_ptr[CField] value_field()

cdef cppclass CLargeListType" arrow::LargeListType"(CDataType):
    # Cython-visible declaration of the C++ class arrow::LargeListType.
    # Two constructors are exposed: from a value data type, or from a field.
    CLargeListType(const shared_ptr[CDataType]& value_type)
    CLargeListType(const shared_ptr[CField]& field)
    # Accessors for the list's value type and value field.
    shared_ptr[CDataType] value_type()
    shared_ptr[CField] value_field()

cdef cppclass CStringType" arrow::StringType"(CDataType):
pass

Expand Down Expand Up @@ -419,6 +426,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CArray] values()
shared_ptr[CDataType] value_type()

cdef cppclass CLargeListArray" arrow::LargeListArray"(CArray):
    # Cython-visible declaration of the C++ class arrow::LargeListArray.

    # Build a list array from an offsets array and a values array; the
    # result is stored in `out` and the outcome reported through CStatus.
    @staticmethod
    CStatus FromArrays(const CArray& offsets, const CArray& values,
                       CMemoryPool* pool, shared_ptr[CArray]* out)

    const int64_t* raw_value_offsets()
    # The element index is declared as int64_t (not int) so that callers
    # passing a 64-bit index are not silently narrowed to a C int.
    int64_t value_offset(int64_t i)
    int64_t value_length(int64_t i)
    shared_ptr[CArray] values()
    shared_ptr[CDataType] value_type()

cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
@staticmethod
CStatus MakeSparse(const CArray& type_ids,
Expand Down
20 changes: 20 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ cdef class ListType(DataType):
const CListType* list_type


cdef class LargeListType(DataType):
    # Extension-type declaration for the large list data type.
    cdef:
        # Typed pointer to the underlying C++ LargeListType object.
        const CLargeListType* list_type


cdef class StructType(DataType):
cdef:
const CStructType* struct_type
Expand Down Expand Up @@ -184,6 +189,17 @@ cdef class ListValue(ArrayValue):
cdef int64_t length(self)


cdef class LargeListValue(ArrayValue):
    # Extension-type declaration for a single element of a large list array.
    cdef readonly:
        # Data type of the list's values (readable from Python).
        DataType value_type

    cdef:
        # Typed pointer to the C++ LargeListArray this element belongs to.
        CLargeListArray* ap

    # C-level accessors: fetch the i-th value / number of values.
    cdef getitem(self, int64_t i)
    cdef int64_t length(self)


cdef class StructValue(ArrayValue):
cdef:
CStructArray* ap
Expand Down Expand Up @@ -336,6 +352,10 @@ cdef class ListArray(Array):
pass


cdef class LargeListArray(Array):
    # Declaration only: no additional C-level attributes beyond Array.
    pass


cdef class UnionArray(Array):
pass

Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Type_LARGE_BINARY = _Type_LARGE_BINARY
Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
Type_LIST = _Type_LIST
Type_LARGE_LIST = _Type_LARGE_LIST
Type_STRUCT = _Type_STRUCT
Type_UNION = _Type_UNION
Type_DICTIONARY = _Type_DICTIONARY
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ cdef api object pyarrow_wrap_data_type(
out = DictionaryType.__new__(DictionaryType)
elif type.get().id() == _Type_LIST:
out = ListType.__new__(ListType)
elif type.get().id() == _Type_LARGE_LIST:
out = LargeListType.__new__(LargeListType)
elif type.get().id() == _Type_STRUCT:
out = StructType.__new__(StructType)
elif type.get().id() == _Type_UNION:
Expand Down
52 changes: 52 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,57 @@ cdef class ListValue(ArrayValue):
return result


cdef class LargeListValue(ArrayValue):
    """
    Concrete class for large list array elements.
    """

    def __len__(self):
        """
        Return the number of values.
        """
        return self.length()

    def __getitem__(self, i):
        """
        Return the value at the given index.
        """
        return self.getitem(_normalize_index(i, self.length()))

    def __iter__(self):
        """
        Iterate over this element's values.
        """
        # The generator ends by falling off the end of the loop.  An
        # explicit ``raise StopIteration`` here would be converted into a
        # RuntimeError under PEP 479 generator semantics.
        for i in range(len(self)):
            yield self.getitem(i)

    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
        # Keep the shared pointer, cache a typed raw pointer to the same
        # array, and wrap the value type for Python access.
        self.sp_array = sp_array
        self.ap = <CLargeListArray*> sp_array.get()
        self.value_type = pyarrow_wrap_data_type(self.ap.value_type())

    cdef getitem(self, int64_t i):
        # Position in the values array: start offset of this list element
        # plus the index within the element.
        cdef int64_t j = self.ap.value_offset(self.index) + i
        return box_scalar(self.value_type, self.ap.values(), j)

    cdef int64_t length(self):
        return self.ap.value_length(self.index)

    def as_py(self):
        """
        Return this value as a Python list.
        """
        cdef:
            int64_t j
            list result = []

        for j in range(len(self)):
            result.append(self.getitem(j).as_py())

        return result


cdef class UnionValue(ArrayValue):
"""
Concrete class for union array elements.
Expand Down Expand Up @@ -729,6 +780,7 @@ cdef dict _array_value_classes = {
_Type_FLOAT: FloatValue,
_Type_DOUBLE: DoubleValue,
_Type_LIST: ListValue,
_Type_LARGE_LIST: LargeListValue,
_Type_UNION: UnionValue,
_Type_BINARY: BinaryValue,
_Type_STRING: StringValue,
Expand Down
12 changes: 9 additions & 3 deletions python/pyarrow/tests/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,10 @@ def fields(type_strategy=primitive_types):


def list_types(item_strategy=primitive_types):
return st.builds(pa.list_, item_strategy)
return (
st.builds(pa.list_, item_strategy) |
st.builds(pa.large_list, item_strategy)
)


def struct_types(item_strategy=primitive_types):
Expand Down Expand Up @@ -159,11 +162,14 @@ def arrays(draw, type, size=None):

shape = (size,)

if pa.types.is_list(type):
if pa.types.is_list(type) or pa.types.is_large_list(type):
offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20
offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero
values = draw(arrays(type.value_type, size=int(offsets.sum())))
return pa.ListArray.from_arrays(offsets, values)
array_type = (
pa.LargeListArray if pa.types.is_large_list(type)
else pa.ListArray)
return array_type.from_arrays(offsets, values)

if pa.types.is_struct(type):
h.assume(len(type) > 0)
Expand Down
Loading

0 comments on commit 2774cfb

Please sign in to comment.