Skip to content

Commit

Permalink
ARROW-5531: [Python] Implement Array.from_buffers for varbinary and n…
Browse files Browse the repository at this point in the history
…ested types, add DataType.num_buffers property

Thanks to Antoine's recent work on `Array::View` this method can be made more robust and safe by checking for the correct number of buffers.

Author: Wes McKinney <[email protected]>

Closes apache#4537 from wesm/ARROW-5531 and squashes the following commits:

ec0695d <Wes McKinney> Address code review feedback
a725338 <Wes McKinney> Implement Array.from_buffers for nested types, add DataType.num_buffers, more checks
  • Loading branch information
wesm committed Jun 13, 2019
1 parent 740e729 commit f068424
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 67 deletions.
53 changes: 32 additions & 21 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -561,9 +561,10 @@ cdef class Array(_PandasConvertible):
(_reduce_array_data(self.sp_array.get().data().get()),)

@staticmethod
def from_buffers(DataType type, length, buffers, null_count=-1, offset=0):
def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
children=None):
"""
Construct an Array from a sequence of buffers. The concrete type
Construct an Array from a sequence of buffers. The concrete type
returned depends on the datatype.
Parameters
Expand All @@ -578,26 +579,45 @@ cdef class Array(_PandasConvertible):
offset : int, default 0
The array's logical offset (in values, not in bytes) from the
start of each buffer
children : List[Array], default None
Nested type children with length matching type.num_children
Returns
-------
array : Array
"""
cdef:
Buffer buf
Array child
vector[shared_ptr[CBuffer]] c_buffers
shared_ptr[CArrayData] ad
vector[shared_ptr[CArrayData]] c_child_data
shared_ptr[CArrayData] array_data

if not is_primitive(type.id):
raise NotImplementedError("from_buffers is only supported for "
"primitive arrays yet.")
children = children or []

if type.num_children != len(children):
raise ValueError("Type's expected number of children "
"({0}) did not match the passed number "
"({1}).".format(type.num_children, len(children)))

if type.num_buffers != len(buffers):
raise ValueError("Type's expected number of buffers "
"({0}) did not match the passed number "
"({1}).".format(type.num_buffers, len(buffers)))

for buf in buffers:
# None will produce a null buffer pointer
c_buffers.push_back(pyarrow_unwrap_buffer(buf))
ad = CArrayData.Make(type.sp_type, length, c_buffers,
null_count, offset)
return pyarrow_wrap_array(MakeArray(ad))

for child in children:
c_child_data.push_back(child.ap.data())

array_data = CArrayData.MakeWithChildren(type.sp_type, length,
c_buffers, c_child_data,
null_count, offset)
cdef Array result = pyarrow_wrap_array(MakeArray(array_data))
result.validate()
return result

@property
def null_count(self):
Expand Down Expand Up @@ -1214,18 +1234,9 @@ cdef class StringArray(Array):
-------
string_array : StringArray
"""
cdef shared_ptr[CBuffer] c_null_bitmap
cdef shared_ptr[CArray] out

if null_bitmap is not None:
c_null_bitmap = null_bitmap.buffer
else:
null_count = 0

out.reset(new CStringArray(
length, value_offsets.buffer, data.buffer, c_null_bitmap,
null_count, offset))
return pyarrow_wrap_array(out)
return Array.from_buffers(utf8(), length,
[null_bitmap, value_offsets, data],
null_count, offset)


cdef class BinaryArray(Array):
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
TimeUnit_MICRO" arrow::TimeUnit::MICRO"
TimeUnit_NANO" arrow::TimeUnit::NANO"

cdef cppclass CDataTypeLayout" arrow::DataTypeLayout":
vector[int64_t] bit_widths
c_bool has_dictionary

cdef cppclass CDataType" arrow::DataType":
Type id()

Expand All @@ -94,6 +98,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:

int num_children()

CDataTypeLayout layout()

c_string ToString()

c_bool is_primitive(Type type)
Expand Down
107 changes: 75 additions & 32 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,81 @@ def test_array_from_buffers():
with pytest.raises(TypeError):
pa.Array.from_buffers(pa.int16(), 3, [u'', u''], offset=1)

with pytest.raises(NotImplementedError):
pa.Array.from_buffers(pa.list_(pa.int16()), 4, [None, values_buf])

def test_string_binary_from_buffers():
array = pa.array(["a", None, "b", "c"])

buffers = array.buffers()
copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0], array.null_count,
array.offset)
assert copied.to_pylist() == ["a", None, "b", "c"]

binary_copy = pa.Array.from_buffers(pa.binary(), len(array),
array.buffers(), array.null_count,
array.offset)
assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"]

copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0])
assert copied.to_pylist() == ["a", None, "b", "c"]

sliced = array[1:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
assert copied.to_pylist() == [None, "b", "c"]
assert copied.null_count == 1

# Slice but exclude all null entries so that we don't need to pass
# the null bitmap.
sliced = array[2:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
assert copied.to_pylist() == ["b", "c"]
assert copied.null_count == 0


def test_list_from_buffers():
ty = pa.list_(pa.int16())
array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)

buffers = array.buffers()

with pytest.raises(ValueError):
# No children
pa.Array.from_buffers(ty, 4, [None, buffers[1]])

child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child])
assert copied.equals(array)

with pytest.raises(ValueError):
# too many children
pa.Array.from_buffers(ty, 4, [None, buffers[1]],
children=[child, child])


def test_struct_from_buffers():
ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())])
array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}],
type=ty)
buffers = array.buffers()

with pytest.raises(ValueError):
# No children
pa.Array.from_buffers(ty, 3, [None, buffers[1]])

children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]),
pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])]
copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children)
assert copied.equals(array)

with pytest.raises(ValueError):
# not enough many children
pa.Array.from_buffers(ty, 3, [buffers[0]],
children=children[:1])


def test_dictionary_from_numpy():
Expand Down Expand Up @@ -499,36 +572,6 @@ def test_union_array_slice():
assert arr[i:j].to_pylist() == lst[i:j]


def test_string_from_buffers():
array = pa.array(["a", None, "b", "c"])

buffers = array.buffers()
copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0], array.null_count,
array.offset)
assert copied.to_pylist() == ["a", None, "b", "c"]

copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0])
assert copied.to_pylist() == ["a", None, "b", "c"]

sliced = array[1:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
assert copied.to_pylist() == [None, "b", "c"]
assert copied.null_count == 1

# Slice but exclude all null entries so that we don't need to pass
# the null bitmap.
sliced = array[2:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
assert copied.to_pylist() == ["b", "c"]
assert copied.null_count == 0


def _check_cast_case(case, safe=True):
in_data, in_type, out_data, out_type = case
if isinstance(out_data, pa.Array):
Expand Down
29 changes: 15 additions & 14 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,21 @@ cdef class DataType:
raise ValueError("Non-fixed width type")
return ty.bit_width()

@property
def num_children(self):
"""
The number of child fields.
"""
return self.type.num_children()

@property
def num_buffers(self):
"""
Number of data buffers required to construct Array type
excluding children
"""
return self.type.layout().bit_widths.size()

def __str__(self):
return frombytes(self.type.ToString())

Expand Down Expand Up @@ -297,13 +312,6 @@ cdef class StructType(DataType):
def __reduce__(self):
return struct, (list(self),)

@property
def num_children(self):
"""
The number of struct fields.
"""
return self.type.num_children()


cdef class UnionType(DataType):
"""
Expand All @@ -313,13 +321,6 @@ cdef class UnionType(DataType):
cdef void init(self, const shared_ptr[CDataType]& type):
DataType.init(self, type)

@property
def num_children(self):
"""
The number of union members.
"""
return self.type.num_children()

@property
def mode(self):
"""
Expand Down

0 comments on commit f068424

Please sign in to comment.