Skip to content

Commit

Permalink
ARROW-8572: [Python] expose UnionArray fields to Python
Browse files Browse the repository at this point in the history
- Adds an explicit range check to `UnionArray.child`
- Exposes `child`, `value_offsets` (as the `offsets` property), and `type_codes` to Python. (In Python, the type-code and offset buffers are wrapped in arrays, to save you the trouble of doing it yourself.)

This lets you losslessly assemble and then disassemble a union array in Python.

Closes apache#7027 from lidavidm/arrow-8572

Authored-by: David Li <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
  • Loading branch information
lidavidm authored and nealrichardson committed Apr 28, 2020
1 parent 0324a9c commit a43199b
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 10 deletions.
4 changes: 4 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -975,6 +975,10 @@ Result<std::shared_ptr<Array>> UnionArray::MakeSparse(
}

std::shared_ptr<Array> UnionArray::child(int i) const {
if (i < 0 ||
static_cast<decltype(boxed_fields_)::size_type>(i) >= boxed_fields_.size()) {
return nullptr;
}
std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]);
if (!result) {
std::shared_ptr<ArrayData> child_data = data_->child_data[i]->Copy();
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/array_union_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ class TestUnionArrayFactories : public ::testing::Test {
for (int64_t i = 0; i < type_ids.length(); ++i) {
ASSERT_EQ(array.child_id(i), type_ids.Value(i));
}
ASSERT_EQ(nullptr, array.child(-1));
ASSERT_EQ(nullptr, array.child(type_ids.length()));
}

void CheckFieldNames(const UnionArray& array, const std::vector<std::string>& names) {
Expand Down
33 changes: 33 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1720,6 +1720,39 @@ cdef class UnionArray(Array):
Concrete class for Arrow arrays of a Union data type.
"""

def child(self, int pos):
    """
    Fetch one child array of this union by position.

    For sparse unions, the returned array has its offset, length,
    and null count adjusted.
    For dense unions, the returned array is unchanged.

    Raises KeyError when the union has no child at ``pos``.
    """
    cdef:
        CUnionArray* union_arr = <CUnionArray*> self.ap
        shared_ptr[CArray] c_child = union_arr.child(pos)

    # A null pointer from the C++ side means there is no such child.
    if c_child.get() == NULL:
        raise KeyError("UnionArray does not have child {}".format(pos))
    return pyarrow_wrap_array(c_child)

@property
def type_codes(self):
    """
    Get the type codes, wrapped as an int8 array of ``len(self)`` elements.
    """
    cdef CUnionArray* union_arr = <CUnionArray*> self.ap
    codes_buf = pyarrow_wrap_buffer(union_arr.type_codes())
    # No validity bitmap is supplied (first buffer is None); the second
    # buffer holds the type codes themselves.
    buffers = [None, codes_buf]
    return Array.from_buffers(int8(), len(self), buffers)

@property
def offsets(self):
    """
    Get the value offsets, wrapped as an int32 array (dense arrays only).

    Does not account for any slice offset.

    Raises ArrowTypeError when the union is not dense.
    """
    cdef CUnionArray* union_arr

    # Reject non-dense unions before touching the offsets buffer.
    if self.type.mode != "dense":
        raise ArrowTypeError("Can only get value offsets for dense arrays")

    union_arr = <CUnionArray*> self.ap
    offsets_buf = pyarrow_wrap_buffer(union_arr.value_offsets())
    # No validity bitmap is supplied (first buffer is None); the second
    # buffer holds the offsets themselves.
    buffers = [None, offsets_buf]
    return Array.from_buffers(int32(), len(self), buffers)

@staticmethod
def from_dense(Array types, Array value_offsets, list children,
list field_names=None, list type_codes=None):
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -567,8 +567,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
const vector[c_string]& field_names,
const vector[int8_t]& type_codes)

shared_ptr[CBuffer] type_codes()
int8_t* raw_type_codes()
int32_t value_offset(int i)
shared_ptr[CBuffer] value_offsets()
int child_id(int64_t index)
shared_ptr[CArray] child(int pos)
const CArray* UnsafeChild(int pos)
Expand Down
49 changes: 39 additions & 10 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,40 +739,54 @@ def test_union_from_dense():
value_offsets = pa.array([1, 0, 0, 2, 1, 2, 3], type='int32')
py_value = [b'b', 1, b'a', b'c', 2, 3, b'd']

def check_result(result, expected_field_names, expected_type_codes):
def check_result(result, expected_field_names, expected_type_codes,
expected_type_code_values):
result.validate(full=True)
actual_field_names = [result.type[i].name
for i in range(result.type.num_children)]
assert actual_field_names == expected_field_names
assert result.type.mode == "dense"
assert result.type.type_codes == expected_type_codes
assert result.to_pylist() == py_value
assert expected_type_code_values.equals(result.type_codes)
assert value_offsets.equals(result.offsets)
assert result.child(0).equals(binary)
assert result.child(1).equals(int64)
with pytest.raises(KeyError):
result.child(-1)
with pytest.raises(KeyError):
result.child(2)

# without field names and type codes
check_result(pa.UnionArray.from_dense(types, value_offsets,
[binary, int64]),
expected_field_names=['0', '1'],
expected_type_codes=[0, 1])
expected_type_codes=[0, 1],
expected_type_code_values=types)

# with field names
check_result(pa.UnionArray.from_dense(types, value_offsets,
[binary, int64],
['bin', 'int']),
expected_field_names=['bin', 'int'],
expected_type_codes=[0, 1])
expected_type_codes=[0, 1],
expected_type_code_values=types)

# with type codes
check_result(pa.UnionArray.from_dense(logical_types, value_offsets,
[binary, int64],
type_codes=[11, 13]),
expected_field_names=['0', '1'],
expected_type_codes=[11, 13])
expected_type_codes=[11, 13],
expected_type_code_values=logical_types)

# with field names and type codes
check_result(pa.UnionArray.from_dense(logical_types, value_offsets,
[binary, int64],
['bin', 'int'], [11, 13]),
expected_field_names=['bin', 'int'],
expected_type_codes=[11, 13])
expected_type_codes=[11, 13],
expected_type_code_values=logical_types)

# Bad type ids
arr = pa.UnionArray.from_dense(logical_types, value_offsets,
Expand All @@ -799,37 +813,52 @@ def test_union_from_sparse():
logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8')
py_value = [b'a', 1, b'b', b'c', 2, 3, b'd']

def check_result(result, expected_field_names, expected_type_codes):
def check_result(result, expected_field_names, expected_type_codes,
expected_type_code_values):
result.validate(full=True)
assert result.to_pylist() == py_value
actual_field_names = [result.type[i].name
for i in range(result.type.num_children)]
assert actual_field_names == expected_field_names
assert result.type.mode == "sparse"
assert result.type.type_codes == expected_type_codes
assert expected_type_code_values.equals(result.type_codes)
assert result.child(0).equals(binary)
assert result.child(1).equals(int64)
with pytest.raises(pa.ArrowTypeError):
result.offsets
with pytest.raises(KeyError):
result.child(-1)
with pytest.raises(KeyError):
result.child(2)

# without field names and type codes
check_result(pa.UnionArray.from_sparse(types, [binary, int64]),
expected_field_names=['0', '1'],
expected_type_codes=[0, 1])
expected_type_codes=[0, 1],
expected_type_code_values=types)

# with field names
check_result(pa.UnionArray.from_sparse(types, [binary, int64],
['bin', 'int']),
expected_field_names=['bin', 'int'],
expected_type_codes=[0, 1])
expected_type_codes=[0, 1],
expected_type_code_values=types)

# with type codes
check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64],
type_codes=[11, 13]),
expected_field_names=['0', '1'],
expected_type_codes=[11, 13])
expected_type_codes=[11, 13],
expected_type_code_values=logical_types)

# with field names and type codes
check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64],
['bin', 'int'],
[11, 13]),
expected_field_names=['bin', 'int'],
expected_type_codes=[11, 13])
expected_type_codes=[11, 13],
expected_type_code_values=logical_types)

# Bad type ids
arr = pa.UnionArray.from_sparse(logical_types, [binary, int64])
Expand Down

0 comments on commit a43199b

Please sign in to comment.