Skip to content

Commit

Permalink
ARROW-1207: [C++] Implement MapArray, MapBuilder, MapType classes, an…
Browse files Browse the repository at this point in the history
…d IPC support

Implements `MapArray` as a subclass of `ListArray`, where each `value` in the list is a `key: item` pair. (This naming is not the most natural, but `value` is taken.)

`MapType::keys_sorted()` is currently stored but unused; for example, `MapBuilder` does not check that inserted keys are correctly ordered. `MapType` is printed as `map<utf8, int32>` for unsorted keys and as `map<int32, float64, keys_sorted>` for sorted keys.

Map arrays are created with `ArrayFromJSON` by providing for each pair an array of length 2 containing the key and the mapped item [(example)](https://github.com/apache/arrow/compare/master...bkietz:1207-Implement-Map-logical-type?expand=1#diff-015ed4b6849ed6e64e25bba42aa1d29eR572).

Author: Benjamin Kietzman <[email protected]>

Closes apache#4352 from bkietz/1207-Implement-Map-logical-type and squashes the following commits:

9fb8700 <Benjamin Kietzman> explicitly disable map in flight test
41b3016 <Benjamin Kietzman> more cleanup, disable JS ipc tests as well
1b74aa1 <Benjamin Kietzman> disable map IPC tests for Java
a0de551 <Benjamin Kietzman> cleanup of code which assumes map has 2 children
2aaab29 <Benjamin Kietzman> ListType isa MapType
9b455e7 <Benjamin Kietzman> Add IPC tests for Map
62dade0 <Benjamin Kietzman> remove redundant null check
a3be934 <Benjamin Kietzman> add tests using and validating MapBuilder
c936ebd <Benjamin Kietzman> fix MapScalar typos
1047a6d <Benjamin Kietzman> run clang-format
31930ff <Benjamin Kietzman> de-inline MapBuilder constructor
eb6db03 <Benjamin Kietzman> set keys_, items_
a5c88a1 <Benjamin Kietzman> fix: obj_ is not a pointer
8049c51 <Benjamin Kietzman> MapArray isa ListArray
4c11db9 <Benjamin Kietzman> adding some tests and filling out Map*
f89da94 <Benjamin Kietzman> first pass at MapArray, MapBuilder, MapScalar
7fbbe70 <Benjamin Kietzman> add checked_pointer_cast for unique_ptr
5e727e5 <Benjamin Kietzman> add map() type factory
e9b34d0 <Benjamin Kietzman> Add keysSorted field
01214fb <Benjamin Kietzman> add MapType and test its ToString
47d95ef <Benjamin Kietzman> add MapType to Layout.rst
  • Loading branch information
bkietz authored and pitrou committed Jun 11, 2019
1 parent 71bcfdf commit dede1e6
Show file tree
Hide file tree
Showing 34 changed files with 1,134 additions and 47 deletions.
146 changes: 146 additions & 0 deletions cpp/src/arrow/array-list-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
namespace arrow {

using internal::checked_cast;
using internal::checked_pointer_cast;

// ----------------------------------------------------------------------
// List tests
Expand Down Expand Up @@ -340,6 +341,151 @@ TEST_F(TestListArray, TestBuilderPreserveFieleName) {
ASSERT_EQ("counts", type.value_field()->name());
}

// ----------------------------------------------------------------------
// Map tests

// Fixture for the map tests: holds a MapBuilder created for map(utf8(), int32()).
class TestMapArray : public TestBuilder {
 public:
  // Runs the base-class set-up, then asks MakeBuilder for a builder matching
  // type_ and keeps it downcast to MapBuilder.
  void SetUp() {
    TestBuilder::SetUp();

    value_type_ = int32();
    key_type_ = utf8();
    type_ = map(key_type_, value_type_);

    std::unique_ptr<ArrayBuilder> generic_builder;
    ASSERT_OK(MakeBuilder(pool_, type_, &generic_builder));
    builder_ = checked_pointer_cast<MapBuilder>(std::move(generic_builder));
  }

  // Finishes builder_ through FinishAndCheckPadding and stores the outcome,
  // downcast to MapArray, in result_.
  void Done() {
    std::shared_ptr<Array> finished;
    FinishAndCheckPadding(builder_.get(), &finished);
    result_ = std::dynamic_pointer_cast<MapArray>(finished);
  }

 protected:
  // Item type and key type handed to map() in SetUp.
  std::shared_ptr<DataType> value_type_;
  std::shared_ptr<DataType> key_type_;

  // Builder under test and the array produced by Done().
  std::shared_ptr<MapBuilder> builder_;
  std::shared_ptr<MapArray> result_;
};

// Builds two identical map arrays and a third, different one, then checks
// Equals() and RangeEquals() between them.
TEST_F(TestMapArray, Equality) {
  // Child builders of the fixture's map builder.
  auto& key_builder = checked_cast<StringBuilder&>(*builder_->key_builder());
  auto& item_builder = checked_cast<Int32Builder&>(*builder_->item_builder());

  std::shared_ptr<Array> array;
  std::shared_ptr<Array> equal_array;
  std::shared_ptr<Array> unequal_array;

  // Contents used for the two arrays that must compare equal.
  std::vector<int32_t> equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10};
  std::vector<util::string_view> equal_keys = {"a", "a", "a", "b", "c",
                                               "a", "a", "a", "a", "b"};
  std::vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6};

  // Contents used for the array that must compare unequal.
  std::vector<int32_t> unequal_offsets = {0, 1, 4, 7};
  std::vector<util::string_view> unequal_keys = {"a", "a", "b", "c", "a", "b", "c"};
  std::vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5};

  // Fill the builder twice with the same contents, finishing into each target.
  for (std::shared_ptr<Array>* target : {&array, &equal_array}) {
    ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size()));
    for (const util::string_view& key : equal_keys) {
      ASSERT_OK(key_builder.Append(key));
    }
    ASSERT_OK(item_builder.AppendValues(equal_values.data(), equal_values.size()));
    ASSERT_OK(builder_->Finish(target));
  }

  // Third array, built from the differing contents.
  ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size()));
  for (const util::string_view& key : unequal_keys) {
    ASSERT_OK(key_builder.Append(key));
  }
  ASSERT_OK(item_builder.AppendValues(unequal_values.data(), unequal_values.size()));
  ASSERT_OK(builder_->Finish(&unequal_array));

  // Whole-array comparison, in both directions.
  EXPECT_TRUE(array->Equals(array));
  EXPECT_TRUE(array->Equals(equal_array));
  EXPECT_TRUE(equal_array->Equals(array));
  EXPECT_FALSE(equal_array->Equals(unequal_array));
  EXPECT_FALSE(unequal_array->Equals(equal_array));

  // Comparison restricted to sub-ranges.
  EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array));
  EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
  EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
  EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array));
}

// Builds a map(int16(), int16()) array with MapBuilder and compares it against a
// MapArray assembled directly from expected keys, items, offsets and null bitmap.
TEST_F(TestMapArray, BuildingIntToInt) {
  auto map_type = map(int16(), int16());

  // Expected child arrays and buffers.
  auto expected_keys = ArrayFromJSON(int16(), R"([
    0, 1, 2, 3, 4, 5,
    0, 1, 2, 3, 4, 5
  ])");
  auto expected_items = ArrayFromJSON(int16(), R"([
    1, 1, 2, 3, 5, 8,
    null, null, 0, 1, null, 2
  ])");
  auto expected_offsets = ArrayFromJSON(int32(), "[0, 6, 6, 12, 12]")->data()->buffers[1];
  auto expected_null_bitmap =
      ArrayFromJSON(boolean(), "[1, 0, 1, 1]")->data()->buffers[1];

  const int64_t expected_length = 4;
  const int64_t expected_null_count = 1;
  const int64_t expected_offset = 0;
  MapArray expected(map_type, expected_length, expected_offsets, expected_keys,
                    expected_items, expected_null_bitmap, expected_null_count,
                    expected_offset);

  // Builder whose key and item children are both Int16Builder.
  auto keys = std::make_shared<Int16Builder>();
  auto items = std::make_shared<Int16Builder>();
  MapBuilder map_builder(default_memory_pool(), keys, items);

  std::shared_ptr<Array> actual;

  // First map: six key/item pairs.
  ASSERT_OK(map_builder.Append());
  ASSERT_OK(keys->AppendValues({0, 1, 2, 3, 4, 5}));
  ASSERT_OK(items->AppendValues({1, 1, 2, 3, 5, 8}));

  // Second map: null.
  ASSERT_OK(map_builder.AppendNull());

  // Third map: six pairs; the second argument flags which items are valid.
  ASSERT_OK(map_builder.Append());
  ASSERT_OK(keys->AppendValues({0, 1, 2, 3, 4, 5}));
  ASSERT_OK(items->AppendValues({-1, -1, 0, 1, -1, 2}, {0, 0, 1, 1, 0, 1}));

  // Fourth map: no pairs appended.
  ASSERT_OK(map_builder.Append());

  ASSERT_OK(map_builder.Finish(&actual));
  ASSERT_OK(ValidateArray(*actual));

  ASSERT_ARRAYS_EQUAL(*actual, expected);
}

// Builds a map(utf8(), int32()) array pair by pair with MapBuilder and compares it
// against a MapArray assembled directly from the expected pieces.
TEST_F(TestMapArray, BuildingStringToInt) {
  auto map_type = map(utf8(), int32());

  // Expected offsets, children and validity bitmap.
  std::vector<int32_t> offsets = {0, 2, 2, 3, 3};
  auto expected_keys = ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])");
  auto expected_values = ArrayFromJSON(int32(), "[0, null, 8]");
  std::shared_ptr<Buffer> expected_null_bitmap;
  ASSERT_OK(
      BitUtil::BytesToBits({1, 0, 1, 1}, default_memory_pool(), &expected_null_bitmap));

  const int64_t expected_length = 4;
  const int64_t expected_null_count = 1;
  MapArray expected(map_type, expected_length, Buffer::Wrap(offsets), expected_keys,
                    expected_values, expected_null_bitmap, expected_null_count);

  // Builder with string keys and int32 items.
  auto keys = std::make_shared<StringBuilder>();
  auto items = std::make_shared<Int32Builder>();
  MapBuilder map_builder(default_memory_pool(), keys, items);

  std::shared_ptr<Array> actual;

  // First map: "joe" -> 0, "mark" -> null.
  ASSERT_OK(map_builder.Append());
  ASSERT_OK(keys->Append("joe"));
  ASSERT_OK(items->Append(0));
  ASSERT_OK(keys->Append("mark"));
  ASSERT_OK(items->AppendNull());

  // Second map: null.
  ASSERT_OK(map_builder.AppendNull());

  // Third map: "cap" -> 8.
  ASSERT_OK(map_builder.Append());
  ASSERT_OK(keys->Append("cap"));
  ASSERT_OK(items->Append(8));

  // Fourth map: no pairs appended.
  ASSERT_OK(map_builder.Append());

  ASSERT_OK(map_builder.Finish(&actual));
  ASSERT_OK(ValidateArray(*actual));

  ASSERT_ARRAYS_EQUAL(*actual, expected);
}

// ----------------------------------------------------------------------
// FixedSizeList tests

Expand Down
89 changes: 81 additions & 8 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,7 @@ BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
// ----------------------------------------------------------------------
// ListArray

// Constructs a ListArray over existing ArrayData: checks with DCHECK_EQ that the
// data's type id is Type::LIST, then hands the data to SetData.
ListArray::ListArray(const std::shared_ptr<ArrayData>& data) {
DCHECK_EQ(data->type->id(), Type::LIST);
SetData(data);
}
// Constructs a ListArray over existing ArrayData; all initialisation is done by
// SetData.
ListArray::ListArray(const std::shared_ptr<ArrayData>& data) {
  this->SetData(data);
}

ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
Expand Down Expand Up @@ -275,6 +272,8 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
DCHECK_EQ(data->buffers.size(), 2);
DCHECK(data->type->id() == Type::LIST);
list_type_ = checked_cast<const ListType*>(data->type.get());

auto value_offsets = data->buffers[1];
raw_value_offsets_ = value_offsets == nullptr
Expand All @@ -285,16 +284,47 @@ void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
values_ = MakeArray(data_->child_data[0]);
}

// Returns the array's data type viewed as a ListType (via checked_cast on the
// type held by data_).
const ListType* ListArray::list_type() const {
  const DataType* stored_type = data_->type.get();
  return checked_cast<const ListType*>(stored_type);
}

std::shared_ptr<DataType> ListArray::value_type() const {
return list_type()->value_type();
}

// Returns the stored values_ array.
std::shared_ptr<Array> ListArray::values() const {
  return values_;
}

// ----------------------------------------------------------------------
// MapArray

// Constructs a MapArray over existing ArrayData; all initialisation is done by
// SetData.
MapArray::MapArray(const std::shared_ptr<ArrayData>& data) {
  this->SetData(data);
}

// Assembles a MapArray from its parts.
//
// The keys and values arrays become the two children of a struct-typed ArrayData
// (typed after the map type's first child field, sized by the keys' length, with no
// validity buffer). That struct data is in turn the single child of the map's
// ArrayData, which carries the null bitmap and the offsets buffer.
//
// NOTE(review): the same `offset` is passed for both the struct child and the map
// data itself -- confirm that this is intended for non-zero offsets.
MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
                   const std::shared_ptr<Buffer>& offsets,
                   const std::shared_ptr<Array>& keys,
                   const std::shared_ptr<Array>& values,
                   const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
                   int64_t offset) {
  auto struct_data = ArrayData::Make(type->children()[0]->type(), keys->data()->length,
                                     {nullptr}, {keys->data(), values->data()}, 0, offset);
  SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {struct_data},
                          null_count, offset));
}

void MapArray::SetData(const std::shared_ptr<ArrayData>& data) {
DCHECK_EQ(data->type->id(), Type::MAP);
auto pair_data = data->child_data[0];
DCHECK_EQ(pair_data->type->id(), Type::STRUCT);
DCHECK_EQ(pair_data->null_count, 0);
DCHECK_EQ(pair_data->child_data.size(), 2);
DCHECK_EQ(pair_data->child_data[0]->null_count, 0);

auto pair_list_data = data->Copy();
pair_list_data->type = list(pair_data->type);
this->ListArray::SetData(pair_list_data);
data_->type = data->type;

keys_ = MakeArray(pair_data->child_data[0]);
items_ = MakeArray(pair_data->child_data[1]);
}

// ----------------------------------------------------------------------
// FixedSizeListArray

Expand Down Expand Up @@ -904,6 +934,49 @@ struct ValidateVisitor {
return ValidateOffsets(array);
}

// Validates a MapArray.
//
// Checks, in order: non-negative length; an offsets buffer that is present (for
// non-empty arrays) and large enough for the length; a non-null, valid keys array;
// a non-null, valid values array; that both the values and the keys lengths equal
// the final offset; and finally the offsets themselves via ValidateOffsets.
// Returns Status::Invalid describing the first violation found.
Status Visit(const MapArray& array) {
  if (array.length() < 0) {
    return Status::Invalid("Length was negative");
  }

  auto value_offsets = array.value_offsets();
  if (array.length() && !value_offsets) {
    return Status::Invalid("value_offsets_ was null");
  }
  // The check above only rejects a missing offsets buffer for non-empty arrays,
  // so the buffer can still be null here; measure it only when present.
  if (value_offsets &&
      value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
    return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
                           " isn't large enough for length: ", array.length());
  }

  if (!array.keys()) {
    return Status::Invalid("keys was null");
  }
  // Validate the keys array itself (this previously re-validated values()).
  const Status key_valid = ValidateArray(*array.keys());
  if (!key_valid.ok()) {
    return Status::Invalid("key array invalid: ", key_valid.ToString());
  }

  if (!array.values()) {
    return Status::Invalid("values was null");
  }
  const Status values_valid = ValidateArray(*array.values());
  if (!values_valid.ok()) {
    return Status::Invalid("values array invalid: ", values_valid.ToString());
  }

  // With no offsets buffer (only possible for an empty array) there is no final
  // offset to read; treat it as zero instead of reading through a null buffer.
  // NOTE(review): ValidateOffsets is not visible here -- confirm it also copes
  // with a missing offsets buffer.
  const int32_t last_offset = value_offsets ? array.value_offset(array.length()) : 0;
  if (array.values()->length() != last_offset) {
    return Status::Invalid("Final offset invariant not equal to values length: ",
                           last_offset, "!=", array.values()->length());
  }
  if (array.keys()->length() != last_offset) {
    return Status::Invalid("Final offset invariant not equal to keys length: ",
                           last_offset, "!=", array.keys()->length());
  }

  return ValidateOffsets(array);
}

Status Visit(const FixedSizeListArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
Expand Down
39 changes: 38 additions & 1 deletion cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ class ARROW_EXPORT ListArray : public Array {
static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out);

const ListType* list_type() const;
/// \brief Return the cached ListType pointer
const ListType* list_type() const {
  return list_type_;
}

/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;
Expand All @@ -521,13 +521,50 @@ class ARROW_EXPORT ListArray : public Array {
}

protected:
// this constructor defers SetData to a derived array class
ListArray() = default;
void SetData(const std::shared_ptr<ArrayData>& data);
const int32_t* raw_value_offsets_;

private:
const ListType* list_type_;
std::shared_ptr<Array> values_;
};

// ----------------------------------------------------------------------
// MapArray

/// Concrete Array class for map data
///
/// NB: "value" in this context refers to a pair of a key and the corresponding item
class ARROW_EXPORT MapArray : public ListArray {
 public:
  using TypeClass = MapType;

  /// \brief Construct a MapArray over existing ArrayData
  explicit MapArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a MapArray from an offsets buffer plus key and item arrays
  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& values,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return the stored MapType pointer
  const MapType* map_type() const { return map_type_; }

  /// \brief Return array object containing all map keys
  std::shared_ptr<Array> keys() const { return keys_; }

  /// \brief Return array object containing all mapped items
  std::shared_ptr<Array> items() const { return items_; }

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

 private:
  const MapType* map_type_;
  // Arrays handed out by keys() and items() respectively.
  std::shared_ptr<Array> keys_;
  std::shared_ptr<Array> items_;
};

// ----------------------------------------------------------------------
// FixedSizeListArray

Expand Down
Loading

0 comments on commit dede1e6

Please sign in to comment.