Skip to content

Commit

Permalink
ARROW-6775: [C++][Python] Implement list_value_lengths and list_paren…
Browse files Browse the repository at this point in the history
…t_indices functions

This adds two functions that operate on list types:

* list_value_lengths: returns an int32 (for List) or int64 (for LargeList) array with the number of elements in each list value slot
* list_parent_indices: returns an int32/int64 array with the same length as the child values array of a List type where each value is the index of the list "slot" containing each child value

Closes apache#7632 from wesm/some-list-functions

Lead-authored-by: Wes McKinney <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
wesm and pitrou committed Jul 7, 2020
1 parent 9283be7 commit e1c3334
Show file tree
Hide file tree
Showing 12 changed files with 244 additions and 5 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_cast_string.cc
compute/kernels/scalar_cast_temporal.cc
compute/kernels/scalar_compare.cc
compute/kernels/scalar_nested.cc
compute/kernels/scalar_set_lookup.cc
compute/kernels/scalar_string.cc
compute/kernels/scalar_validity.cc
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ add_arrow_compute_test(scalar_test
scalar_boolean_test.cc
scalar_cast_test.cc
scalar_compare_test.cc
scalar_nested_test.cc
scalar_set_lookup_test.cc
scalar_string_test.cc
scalar_validity_test.cc
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_nested.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Vector kernels involving nested types

#include "arrow/array/array_base.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/result.h"
#include "arrow/visitor_inline.h"

namespace arrow {
namespace compute {
namespace internal {
namespace {

template <typename Type, typename offset_type = typename Type::offset_type>
void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
using ScalarType = typename TypeTraits<Type>::ScalarType;
using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;

if (batch[0].kind() == Datum::ARRAY) {
typename TypeTraits<Type>::ArrayType list(batch[0].array());
ArrayData* out_arr = out->mutable_array();
auto out_values = out_arr->GetMutableValues<offset_type>(1);
const offset_type* offsets = list.raw_value_offsets();
::arrow::internal::detail::VisitBitBlocksVoid(
list.data()->buffers[0], list.offset(), list.length(),
[&](int64_t position) {
*out_values++ = offsets[position + 1] - offsets[position];
},
[&]() { *out_values++ = 0; });
} else {
const auto& arg0 = batch[0].scalar_as<ScalarType>();
if (arg0.is_valid) {
checked_cast<OffsetScalarType*>(out->scalar().get())->value =
static_cast<offset_type>(arg0.value->length());
}
}
}

} // namespace

void RegisterScalarNested(FunctionRegistry* registry) {
auto list_value_lengths =
std::make_shared<ScalarFunction>("list_value_lengths", Arity::Unary());
DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LIST)}, int32(),
ListValueLengths<ListType>));
DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
ListValueLengths<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_value_lengths)));
}

} // namespace internal
} // namespace compute
} // namespace arrow
40 changes: 40 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_nested_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include "arrow/compute/api.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"

namespace arrow {
namespace compute {

static std::shared_ptr<DataType> GetOffsetType(const DataType& type) {
return type.id() == Type::LIST ? int32() : int64();
}

TEST(TestScalarNested, ListValueLengths) {
for (auto ty : {list(int32()), large_list(int32())}) {
CheckScalarUnary("list_value_lengths", ty, "[[0, null, 1], null, [2, 3], []]",
GetOffsetType(*ty), "[3, null, 2, 0]");
}
}

} // namespace compute
} // namespace arrow
43 changes: 38 additions & 5 deletions cpp/src/arrow/compute/kernels/vector_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
namespace arrow {
namespace compute {
namespace internal {
namespace {

template <typename Type>
void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand All @@ -36,19 +37,51 @@ void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
out->value = (*result)->data();
}

static Result<ValueDescr> ValuesType(KernelContext*,
const std::vector<ValueDescr>& args) {
template <typename Type, typename offset_type = typename Type::offset_type>
void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
typename TypeTraits<Type>::ArrayType list(batch[0].array());
ArrayData* out_arr = out->mutable_array();

const offset_type* offsets = list.raw_value_offsets();
offset_type values_length = offsets[list.length()] - offsets[0];

out_arr->length = values_length;
out_arr->null_count = 0;
KERNEL_ASSIGN_OR_RAISE(out_arr->buffers[1], ctx,
ctx->Allocate(values_length * sizeof(offset_type)));
auto out_indices = reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
for (int64_t i = 0; i < list.length(); ++i) {
// Note: In most cases, null slots are empty, but when they are non-empty
// we write out the indices so make sure they are accounted for. This
// behavior could be changed if needed in the future.
for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) {
*out_indices++ = static_cast<offset_type>(i);
}
}
}

Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& args) {
const auto& list_type = checked_cast<const BaseListType&>(*args[0].type);
return ValueDescr::Array(list_type.value_type());
}

} // namespace

void RegisterVectorNested(FunctionRegistry* registry) {
auto flatten = std::make_shared<VectorFunction>("list_flatten", Arity::Unary());
DCHECK_OK(flatten->AddKernel({InputType(Type::LIST)}, OutputType(ValuesType),
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ValuesType),
ListFlatten<ListType>));
DCHECK_OK(flatten->AddKernel({InputType(Type::LARGE_LIST)}, OutputType(ValuesType),
ListFlatten<LargeListType>));
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
OutputType(ValuesType), ListFlatten<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(flatten)));

auto list_parent_indices =
std::make_shared<VectorFunction>("list_parent_indices", Arity::Unary());
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, int32(),
ListParentIndices<ListType>));
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, int64(),
ListParentIndices<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_parent_indices)));
}

} // namespace internal
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_nested_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,25 @@ TEST(TestVectorNested, ListFlatten) {
}
}

TEST(TestVectorNested, ListParentIndices) {
for (auto ty : {list(int32()), large_list(int32())}) {
auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");

auto out_ty = ty->id() == Type::LIST ? int32() : int64();
auto expected = ArrayFromJSON(out_ty, "[0, 0, 0, 2, 2, 4, 4]");
ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {input}));
AssertArraysEqual(*expected, *out.make_array());
}

// Construct a list with non-empty null slots
auto input = ArrayFromJSON(list(int32()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]");
std::shared_ptr<ArrayData> data = input->data()->Copy();
data->buffers[0] =
(ArrayFromJSON(boolean(), "[true, false, true, true, true]")->data()->buffers[1]);
auto expected = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]");
ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {data}));
AssertArraysEqual(*expected, *out.make_array());
}

} // namespace compute
} // namespace arrow
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
RegisterScalarBoolean(registry.get());
RegisterScalarCast(registry.get());
RegisterScalarComparison(registry.get());
RegisterScalarNested(registry.get());
RegisterScalarSetLookup(registry.get());
RegisterScalarStringAscii(registry.get());
RegisterScalarValidity(registry.get());
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/registry_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry);
void RegisterScalarBoolean(FunctionRegistry* registry);
void RegisterScalarCast(FunctionRegistry* registry);
void RegisterScalarComparison(FunctionRegistry* registry);
void RegisterScalarNested(FunctionRegistry* registry);
void RegisterScalarSetLookup(FunctionRegistry* registry);
void RegisterScalarStringAscii(FunctionRegistry* registry);
void RegisterScalarValidity(FunctionRegistry* registry);
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/type_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ struct TypeTraits<ListType> {
using OffsetType = Int32Type;
using OffsetArrayType = Int32Array;
using OffsetBuilderType = Int32Builder;
using OffsetScalarType = Int32Scalar;
constexpr static bool is_parameter_free = false;
};

Expand All @@ -318,6 +319,7 @@ struct TypeTraits<LargeListType> {
using OffsetType = Int64Type;
using OffsetArrayType = Int64Array;
using OffsetBuilderType = Int64Builder;
using OffsetScalarType = Int64Scalar;
constexpr static bool is_parameter_free = false;
};

Expand Down
41 changes: 41 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1361,6 +1361,47 @@ cdef class BaseListArray(Array):
"""
return _pc().list_flatten(self)

def value_parent_indices(self):
"""
Return array of same length as list child values array where each
output value is the index of the parent list array slot containing each
child value.
Examples
--------
>>> arr = pa.array([[1, 2, 3], [], None, [4]],
... type=pa.list_(pa.int32()))
>>> arr.value_parent_indices()
<pyarrow.lib.Int32Array object at 0x7efc5db958a0>
[
0,
0,
0,
3
]
"""
return _pc().list_parent_indices(self)

def value_lengths(self):
"""
Return integers array with values equal to the respective length of
each list element. Null list values are null in the output.
Examples
--------
>>> arr = pa.array([[1, 2, 3], [], None, [4]],
... type=pa.list_(pa.int32()))
>>> arr.value_lengths()
<pyarrow.lib.Int32Array object at 0x7efc5db95910>
[
3,
0,
null,
1
]
"""
return _pc().list_value_lengths(self)


cdef class ListArray(BaseListArray):
"""
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def func(left, right):
is_null = _simple_unary_function('is_null')

list_flatten = _simple_unary_function('list_flatten')
list_parent_indices = _simple_unary_function('list_parent_indices')
list_value_lengths = _simple_unary_function('list_value_lengths')

add = _simple_binary_function('add')
subtract = _simple_binary_function('subtract')
Expand Down
28 changes: 28 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,34 @@ def test_list_array_flatten(offset_type, list_type_factory):
assert arr2.values.values.equals(arr0)


@pytest.mark.parametrize(('offset_type', 'list_type_factory'),
[(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
def test_list_value_parent_indices(offset_type, list_type_factory):
arr = pa.array(
[
[0, 1, 2],
None,
[],
[3, 4]
], type=list_type_factory(pa.int32()))
expected = pa.array([0, 0, 0, 3, 3], type=offset_type)
assert arr.value_parent_indices().equals(expected)


@pytest.mark.parametrize(('offset_type', 'list_type_factory'),
[(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
def test_list_value_lengths(offset_type, list_type_factory):
arr = pa.array(
[
[0, 1, 2],
None,
[],
[3, 4]
], type=list_type_factory(pa.int32()))
expected = pa.array([3, None, 0, 2], type=offset_type)
assert arr.value_lengths().equals(expected)


@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
def test_list_array_flatten_non_canonical(list_type_factory):
# Non-canonical list array (null elements backed by non-empty sublists)
Expand Down

0 comments on commit e1c3334

Please sign in to comment.