Skip to content

Commit

Permalink
apacheGH-39798: [C++] Optimize Take for fixed-size types including ne…
Browse files Browse the repository at this point in the history
…sted fixed-size lists (apache#41297)

### Rationale for this change

Introduce utilities for dealing with fixed-width types (including fixed-size lists of fixed-width types) generically. And use it for initial optimizations of `Take` and `Filter`.

### What changes are included in this PR?

- [x] Introduce utilities for dealing with fixed-width types generically
- [x] Use faster `Take` kernel on small power-of-2 byte widths of fixed-width types
  - [x] from `FSLTakeExec` (including FSLs of FSBs)
  - [x] from `FSBTakeExec` (done before this PR)
- [x] ~Take on any fixed-width type~ (as a separate issue apache#41301)
- [x] Use faster `Filter` kernel on both primitive and fixed-width types of any length
   - [x] from `FSLFilterExec` (including FSLs of FSBs)
   - [x] from `FSBFilterExec` (done before this PR)

### Are these changes tested?

By existing and new tests.

### Are there any user-facing changes?

Some functions added to the `arrow::util` namespace and documented inline.
* GitHub Issue: apache#39798

Authored-by: Felipe Oliveira Carvalho <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
  • Loading branch information
felipecrv authored May 3, 2024
1 parent 71e38fc commit 9749d7d
Show file tree
Hide file tree
Showing 11 changed files with 1,171 additions and 72 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS
util/decimal.cc
util/delimiting.cc
util/dict_util.cc
util/fixed_width_internal.cc
util/float16.cc
util/formatting.cc
util/future.cc
Expand Down
30 changes: 17 additions & 13 deletions cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/fixed_width_internal.h"

namespace arrow {

Expand Down Expand Up @@ -158,9 +159,11 @@ class PrimitiveFilterImpl {
PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter,
FilterOptions::NullSelectionBehavior null_selection,
ArrayData* out_arr)
: byte_width_(values.type->byte_width()),
: byte_width_(util::FixedWidthInBytes(*values.type)),
values_is_valid_(values.buffers[0].data),
values_data_(values.buffers[1].data),
// No offset applied for boolean because it's a bitmap
values_data_(kIsBoolean ? values.buffers[1].data
: util::OffsetPointerOfFixedWidthValues(values)),
values_null_count_(values.null_count),
values_offset_(values.offset),
values_length_(values.length),
Expand All @@ -169,17 +172,13 @@ class PrimitiveFilterImpl {
if constexpr (kByteWidth >= 0 && !kIsBoolean) {
DCHECK_EQ(kByteWidth, byte_width_);
}
if constexpr (!kIsBoolean) {
// No offset applied for boolean because it's a bitmap
values_data_ += values.offset * byte_width();
}

DCHECK_EQ(out_arr->offset, 0);
if (out_arr->buffers[0] != nullptr) {
// May be unallocated if neither filter nor values contain nulls
out_is_valid_ = out_arr->buffers[0]->mutable_data();
}
out_data_ = out_arr->buffers[1]->mutable_data();
DCHECK_EQ(out_arr->offset, 0);
out_data_ = util::MutableFixedWidthValuesPointer(out_arr);
out_length_ = out_arr->length;
out_position_ = 0;
}
Expand Down Expand Up @@ -416,7 +415,7 @@ class PrimitiveFilterImpl {
out_position_ += length;
}

constexpr int32_t byte_width() const {
constexpr int64_t byte_width() const {
if constexpr (kByteWidth >= 0) {
return kByteWidth;
} else {
Expand All @@ -425,7 +424,7 @@ class PrimitiveFilterImpl {
}

private:
int32_t byte_width_;
int64_t byte_width_;
const uint8_t* values_is_valid_;
const uint8_t* values_data_;
int64_t values_null_count_;
Expand All @@ -439,6 +438,8 @@ class PrimitiveFilterImpl {
int64_t out_position_;
};

} // namespace

Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;
const ArraySpan& filter = batch[1].array;
Expand Down Expand Up @@ -468,9 +469,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
// validity bitmap.
const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero;

const int bit_width = values.type->bit_width();
RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width,
allocate_validity, out_arr));
DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false));
const int64_t bit_width = util::FixedWidthInBits(*values.type);
RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
ctx, output_length, /*source=*/values, allocate_validity, out_arr));

switch (bit_width) {
case 1:
Expand Down Expand Up @@ -505,6 +507,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
return Status::OK();
}

namespace {

// ----------------------------------------------------------------------
// Optimized filter for base binary types (32-bit and 64-bit)

Expand Down
56 changes: 38 additions & 18 deletions cpp/src/arrow/compute/kernels/vector_selection_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/fixed_width_internal.h"
#include "arrow/util/int_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/ree_util.h"
Expand Down Expand Up @@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
DCHECK_OK(registry->AddFunction(std::move(func)));
}

Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
bool allocate_validity, ArrayData* out) {
// Preallocate memory
out->length = length;
out->buffers.resize(2);

if (allocate_validity) {
ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length));
}
if (bit_width == 1) {
ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length));
} else {
ARROW_ASSIGN_OR_RAISE(out->buffers[1],
ctx->Allocate(bit_util::BytesForBits(length * bit_width)));
}
return Status::OK();
}

namespace {

/// \brief Iterate over a REE filter, emitting ranges of a plain values array that
Expand Down Expand Up @@ -909,6 +892,20 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
}

Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;

// If a FixedSizeList wraps a fixed-width type we can, in some cases, use
// PrimitiveFilterExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec
// might not handle it correctly.
if (byte_width > 0) {
return PrimitiveFilterExec(ctx, batch, out);
}
}
return FilterExec<FSLSelectionImpl>(ctx, batch, out);
}

Expand Down Expand Up @@ -968,6 +965,29 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
}

Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;

// If a FixedSizeList wraps a fixed-width type we can, in some cases, use
// PrimitiveTakeExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// Additionally, PrimitiveTakeExec is only implemented for specific byte widths.
// TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type.
switch (byte_width) {
case 1:
case 2:
case 4:
case 8:
case 16:
case 32:
return PrimitiveTakeExec(ctx, batch, out);
default:
break; // fallback to TakeExec<FSBSelectionImpl>
}
}

return TakeExec<FSLSelectionImpl>(ctx, batch, out);
}

Expand Down
7 changes: 1 addition & 6 deletions cpp/src/arrow/compute/kernels/vector_selection_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
const FunctionOptions* default_options,
FunctionRegistry* registry);

/// \brief Allocate an ArrayData for a primitive array with a given length and bit width
///
/// \param[in] bit_width 1 or a multiple of 8
Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
bool allocate_validity, ArrayData* out);

/// \brief Callback type for VisitPlainxREEFilterOutputSegments.
///
/// position is the logical position in the values array relative to its offset.
Expand All @@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments(
FilterOptions::NullSelectionBehavior null_selection,
const EmitREEFilterSegment& emit_segment);

Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Expand Down
39 changes: 23 additions & 16 deletions cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/fixed_width_internal.h"
#include "arrow/util/int_util.h"
#include "arrow/util/ree_util.h"

Expand Down Expand Up @@ -323,7 +324,7 @@ namespace {
using TakeState = OptionsWrapper<TakeOptions>;

// ----------------------------------------------------------------------
// Implement optimized take for primitive types from boolean to 1/2/4/8-byte
// Implement optimized take for primitive types from boolean to 1/2/4/8/16/32-byte
// C-type based types. Use common implementation for every byte width and only
// generate code for unsigned integer indices, since after boundschecking to
// check for negative numbers in the indices we can safely reinterpret_cast
Expand All @@ -333,33 +334,36 @@ using TakeState = OptionsWrapper<TakeOptions>;
/// use the logical Arrow type but rather the physical C type. This way we
/// only generate one take function for each byte width.
///
/// This function assumes that the indices have been boundschecked.
/// Also note that this function can also handle fixed-size-list arrays if
/// they fit the criteria described in fixed_width_internal.h, so use the
/// function defined in that file to access values and destination pointers
/// and DO NOT ASSUME `values.type()` is a primitive type.
///
/// \pre the indices have been boundschecked
template <typename IndexCType, typename ValueWidthConstant>
struct PrimitiveTakeImpl {
static constexpr int kValueWidth = ValueWidthConstant::value;

static void Exec(const ArraySpan& values, const ArraySpan& indices,
ArrayData* out_arr) {
DCHECK_EQ(values.type->byte_width(), kValueWidth);
const auto* values_data =
values.GetValues<uint8_t>(1, 0) + kValueWidth * values.offset;
DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth);
const auto* values_data = util::OffsetPointerOfFixedWidthValues(values);
const uint8_t* values_is_valid = values.buffers[0].data;
auto values_offset = values.offset;

const auto* indices_data = indices.GetValues<IndexCType>(1);
const uint8_t* indices_is_valid = indices.buffers[0].data;
auto indices_offset = indices.offset;

auto out = out_arr->GetMutableValues<uint8_t>(1, 0) + kValueWidth * out_arr->offset;
DCHECK_EQ(out_arr->offset, 0);
auto* out = util::MutableFixedWidthValuesPointer(out_arr);
auto out_is_valid = out_arr->buffers[0]->mutable_data();
auto out_offset = out_arr->offset;
DCHECK_EQ(out_offset, 0);

// If either the values or indices have nulls, we preemptively zero out the
// out validity bitmap so that we don't have to use ClearBit in each
// iteration for nulls.
if (values.null_count != 0 || indices.null_count != 0) {
bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
bit_util::SetBitsTo(out_is_valid, 0, indices.length, false);
}

auto WriteValue = [&](int64_t position) {
Expand All @@ -386,7 +390,7 @@ struct PrimitiveTakeImpl {
valid_count += block.popcount;
if (block.popcount == block.length) {
// Fastest path: neither values nor index nulls
bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
bit_util::SetBitsTo(out_is_valid, position, block.length, true);
for (int64_t i = 0; i < block.length; ++i) {
WriteValue(position);
++position;
Expand All @@ -396,7 +400,7 @@ struct PrimitiveTakeImpl {
for (int64_t i = 0; i < block.length; ++i) {
if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
// index is not null
bit_util::SetBit(out_is_valid, out_offset + position);
bit_util::SetBit(out_is_valid, position);
WriteValue(position);
} else {
WriteZero(position);
Expand All @@ -416,7 +420,7 @@ struct PrimitiveTakeImpl {
values_offset + indices_data[position])) {
// value is not null
WriteValue(position);
bit_util::SetBit(out_is_valid, out_offset + position);
bit_util::SetBit(out_is_valid, position);
++valid_count;
} else {
WriteZero(position);
Expand All @@ -433,7 +437,7 @@ struct PrimitiveTakeImpl {
values_offset + indices_data[position])) {
// index is not null && value is not null
WriteValue(position);
bit_util::SetBit(out_is_valid, out_offset + position);
bit_util::SetBit(out_is_valid, position);
++valid_count;
} else {
WriteZero(position);
Expand Down Expand Up @@ -584,14 +588,17 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*

ArrayData* out_arr = out->array_data().get();

const int bit_width = values.type->bit_width();
DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false,
/*exclude_dictionary=*/true));
const int64_t bit_width = util::FixedWidthInBits(*values.type);

// TODO: When neither values nor indices contain nulls, we can skip
// allocating the validity bitmap altogether and save time and space. A
// streamlined PrimitiveTakeImpl would need to be written that skips all
// interactions with the output validity bitmap, though.
RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, indices.length, bit_width,
/*allocate_validity=*/true, out_arr));
RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
ctx, indices.length, /*source=*/values,
/*allocate_validity=*/true, out_arr));
switch (bit_width) {
case 1:
TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
Expand Down
Loading

0 comments on commit 9749d7d

Please sign in to comment.