Skip to content

Commit

Permalink
ARROW-1177: [C++] Check for int32 offset overflow in ListBuilder, Bin…
Browse files Browse the repository at this point in the history
…aryBuilder

I also refactored BinaryBuilder to not inherit from ListBuilder, which is a bit cleaner. I added a draft of ARROW-507; it needs a unit test and to handle the case where some passed offsets are null (so they need to be sanitized)

Author: Wes McKinney <[email protected]>

Closes apache#853 from wesm/ARROW-1177 and squashes the following commits:

f6be04f [Wes McKinney] Fix DCHECKs in ListBuilder, BinaryBuilder
28f17ab [Wes McKinney] Use binary strings for py2.7
c9e7502 [Wes McKinney] Fix some off-by-one errors
5a8be84 [Wes McKinney] Fix another warning
23adefc [Wes McKinney] Fix compiler warning
35ab4f2 [Wes McKinney] Refactoring BinaryBuilder. Add check for int32 offset overflow for List, Binary, String. Add basic ListArray::FromArrays method, add Python binding
  • Loading branch information
wesm authored and xhochy committed Jul 17, 2017
1 parent 0396240 commit 1541a08
Show file tree
Hide file tree
Showing 13 changed files with 313 additions and 131 deletions.
11 changes: 6 additions & 5 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,14 @@ Status MakeArrayFromValidBytes(
std::shared_ptr<Buffer> null_buf;
RETURN_NOT_OK(BitUtil::BytesToBits(v, &null_buf));

BufferBuilder value_builder(pool);
TypedBufferBuilder<int32_t> value_builder(pool);
for (size_t i = 0; i < v.size(); ++i) {
RETURN_NOT_OK(value_builder.Append<int32_t>(0));
RETURN_NOT_OK(value_builder.Append(0));
}

*out = std::make_shared<Int32Array>(
v.size(), value_builder.Finish(), null_buf, null_count);
std::shared_ptr<Buffer> values;
RETURN_NOT_OK(value_builder.Finish(&values));
*out = std::make_shared<Int32Array>(v.size(), values, null_buf, null_count);
return Status::OK();
}

Expand Down Expand Up @@ -996,7 +997,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
vector<uint8_t> is_null = {0, 0, 0, 1, 0};

int N = static_cast<int>(strings.size());
int reps = 1000;
int reps = 10;

for (int j = 0; j < reps; ++j) {
for (int i = 0; i < N; ++i) {
Expand Down
26 changes: 26 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,32 @@ ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
SetData(internal_data);
}

Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out) {
if (ARROW_PREDICT_FALSE(offsets.length() == 0)) {
return Status::Invalid("List offsets must have non-zero length");
}

if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) {
return Status::Invalid("Null offsets in ListArray::FromArrays not yet implemented");
}

if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) {
return Status::Invalid("List offsets must be signed int32");
}

BufferVector buffers = {
offsets.null_bitmap(), static_cast<const Int32Array&>(offsets).values()};

auto list_type = list(values.type());
auto internal_data = std::make_shared<internal::ArrayData>(list_type,
offsets.length() - 1, std::move(buffers), offsets.null_count(), offsets.offset());
internal_data->child_data.push_back(values.data());

*out = std::make_shared<ListArray>(internal_data);
return Status::OK();
}

void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
auto value_offsets = data->buffers[1];
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,19 @@ class ARROW_EXPORT ListArray : public Array {
const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0,
int64_t offset = 0);

/// \brief Construct ListArray from array of offsets and child value array
///
/// Note: does not validate input beyond sanity checks. Use
/// arrow::ValidateArray if you need stronger validation of inputs
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and size
/// \param[in] values Array containing
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[out] out Will have length equal to offsets.length() - 1
static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out);

/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;

Expand Down
73 changes: 43 additions & 30 deletions cpp/src/arrow/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,58 +212,71 @@ class ARROW_EXPORT BufferBuilder {
return Status::OK();
}

template <typename T>
// Unsafe methods don't check existing size
void UnsafeAppend(const uint8_t* data, int64_t length) {
memcpy(data_ + size_, data, static_cast<size_t>(length));
size_ += length;
}

Status Finish(std::shared_ptr<Buffer>* out) {
// Do not shrink to fit to avoid unneeded realloc
if (size_ > 0) { RETURN_NOT_OK(buffer_->Resize(size_, false)); }
*out = buffer_;
Reset();
return Status::OK();
}

void Reset() {
buffer_ = nullptr;
capacity_ = size_ = 0;
}

int64_t capacity() const { return capacity_; }
int64_t length() const { return size_; }
const uint8_t* data() const { return data_; }

protected:
std::shared_ptr<PoolBuffer> buffer_;
MemoryPool* pool_;
uint8_t* data_;
int64_t capacity_;
int64_t size_;
};

template <typename T>
class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
public:
explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {}

Status Append(T arithmetic_value) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
return Append(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
return BufferBuilder::Append(
reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
}

template <typename T>
Status Append(const T* arithmetic_values, int64_t num_elements) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
return Append(
return BufferBuilder::Append(
reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T));
}

// Unsafe methods don't check existing size
void UnsafeAppend(const uint8_t* data, int64_t length) {
memcpy(data_ + size_, data, static_cast<size_t>(length));
size_ += length;
}

template <typename T>
void UnsafeAppend(T arithmetic_value) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
BufferBuilder::UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T));
}

template <typename T>
void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) {
static_assert(std::is_arithmetic<T>::value,
"Convenience buffer append only supports arithmetic types");
UnsafeAppend(
BufferBuilder::UnsafeAppend(
reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T));
}

std::shared_ptr<Buffer> Finish() {
auto result = buffer_;
buffer_ = nullptr;
capacity_ = size_ = 0;
return result;
}
int64_t capacity() const { return capacity_; }
int64_t length() const { return size_; }
const uint8_t* data() const { return data_; }

private:
std::shared_ptr<PoolBuffer> buffer_;
MemoryPool* pool_;
uint8_t* data_;
int64_t capacity_;
int64_t size_;
const T* data() const { return reinterpret_cast<const T*>(data_); }
int64_t length() const { return size_ / sizeof(T); }
};

/// Allocate a new mutable buffer from a memory pool
Expand Down
Loading

0 comments on commit 1541a08

Please sign in to comment.