Skip to content

Commit

Permalink
ARROW-5155: [GLib][Ruby] Add support for building union arrays from d…
Browse files Browse the repository at this point in the history
…ata type

This is separated from apache#3723.
This should be merged after apache#3723.

Author: Kenta Murata <[email protected]>
Author: Kouhei Sutou <[email protected]>

Closes apache#4127 from mrkn/glib_ruby_make_union_array_with_field_names and squashes the following commits:

e625556 <Kouhei Sutou> Fix test data
f82ac3d <Kenta Murata>  Fix test cases
d550dc9 <Kenta Murata>  Fix comment
f1bfa07 <Kenta Murata>  Stop copying a type_code vector
606a04c <Kenta Murata>  Use new constructors of union arrays
5ad5572 <Kenta Murata>  Add garrow_dense_union_array_new_data_type
c8793d5 <Kenta Murata>  Add garrow_sparse_union_array_new_data_type
  • Loading branch information
mrkn authored and kou committed Apr 25, 2019
1 parent 948379f commit ecfb807
Show file tree
Hide file tree
Showing 6 changed files with 238 additions and 62 deletions.
97 changes: 97 additions & 0 deletions c_glib/arrow-glib/composite-array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,53 @@ garrow_sparse_union_array_new(GArrowInt8Array *type_ids,
}
}

/**
 * garrow_sparse_union_array_new_data_type:
 * @data_type: The data type for the sparse array. Its child field
 *   names and type codes are used for the new array.
 * @type_ids: The field type IDs for each value as #GArrowInt8Array.
 * @fields: (element-type GArrowArray): The arrays for each field
 *   as #GList of #GArrowArray.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable): A newly created #GArrowSparseUnionArray
 *   or %NULL on error.
 *
 * Since: 0.14.0
 */
GArrowSparseUnionArray *
garrow_sparse_union_array_new_data_type(GArrowSparseUnionDataType *data_type,
                                        GArrowInt8Array *type_ids,
                                        GList *fields,
                                        GError **error)
{
  /* Unwrap the GLib wrappers into their Arrow C++ counterparts. */
  auto arrow_union_type =
    std::static_pointer_cast<arrow::UnionType>(
      garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)));
  auto arrow_ids = garrow_array_get_raw(GARROW_ARRAY(type_ids));

  /* The field names come from the children of the given data type. */
  std::vector<std::string> names;
  for (const auto &child : arrow_union_type->children()) {
    names.emplace_back(child->name());
  }

  /* Collect the raw Arrow arrays from the GList, keeping list order. */
  std::vector<std::shared_ptr<arrow::Array>> children;
  for (GList *node = fields; node != NULL; node = node->next) {
    children.emplace_back(garrow_array_get_raw(GARROW_ARRAY(node->data)));
  }

  std::shared_ptr<arrow::Array> arrow_array;
  const auto status =
    arrow::UnionArray::MakeSparse(*arrow_ids,
                                  children,
                                  names,
                                  arrow_union_type->type_codes(),
                                  &arrow_array);
  if (!garrow_error_check(error,
                          status,
                          "[sparse-union-array][new][data-type]")) {
    return NULL;
  }
  return GARROW_SPARSE_UNION_ARRAY(garrow_array_new_raw(&arrow_array));
}


G_DEFINE_TYPE(GArrowDenseUnionArray,
garrow_dense_union_array,
Expand Down Expand Up @@ -420,6 +467,56 @@ garrow_dense_union_array_new(GArrowInt8Array *type_ids,
}
}

/**
 * garrow_dense_union_array_new_data_type:
 * @data_type: The data type for the dense array. Its child field
 *   names and type codes are used for the new array.
 * @type_ids: The field type IDs for each value as #GArrowInt8Array.
 * @value_offsets: The value offsets for each value as #GArrowInt32Array.
 *   Each offset is counted for each type.
 * @fields: (element-type GArrowArray): The arrays for each field
 *   as #GList of #GArrowArray.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable): A newly created #GArrowDenseUnionArray
 *   or %NULL on error.
 *
 * Since: 0.14.0
 */
GArrowDenseUnionArray *
garrow_dense_union_array_new_data_type(GArrowDenseUnionDataType *data_type,
                                       GArrowInt8Array *type_ids,
                                       GArrowInt32Array *value_offsets,
                                       GList *fields,
                                       GError **error)
{
  auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
  auto arrow_union_data_type =
    std::static_pointer_cast<arrow::UnionType>(arrow_data_type);

  /* The field names come from the children of the given data type;
   * their count is known up front, so allocate once. */
  const auto &arrow_children = arrow_union_data_type->children();
  std::vector<std::string> arrow_field_names;
  arrow_field_names.reserve(arrow_children.size());
  for (const auto &arrow_field : arrow_children) {
    arrow_field_names.push_back(arrow_field->name());
  }

  auto arrow_type_ids = garrow_array_get_raw(GARROW_ARRAY(type_ids));
  auto arrow_value_offsets = garrow_array_get_raw(GARROW_ARRAY(value_offsets));

  /* Collect the raw Arrow arrays from the GList, keeping list order. */
  std::vector<std::shared_ptr<arrow::Array>> arrow_fields;
  for (auto node = fields; node; node = node->next) {
    auto *field = GARROW_ARRAY(node->data);
    arrow_fields.push_back(garrow_array_get_raw(field));
  }

  std::shared_ptr<arrow::Array> arrow_union_array;
  auto status = arrow::UnionArray::MakeDense(*arrow_type_ids,
                                             *arrow_value_offsets,
                                             arrow_fields,
                                             arrow_field_names,
                                             arrow_union_data_type->type_codes(),
                                             &arrow_union_array);
  if (garrow_error_check(error, status, "[dense-union-array][new][data-type]")) {
    return GARROW_DENSE_UNION_ARRAY(garrow_array_new_raw(&arrow_union_array));
  } else {
    return NULL;
  }
}


G_DEFINE_TYPE(GArrowDictionaryArray,
garrow_dictionary_array,
Expand Down
11 changes: 11 additions & 0 deletions c_glib/arrow-glib/composite-array.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ GArrowSparseUnionArray *
garrow_sparse_union_array_new(GArrowInt8Array *type_ids,
GList *fields,
GError **error);
/* Like garrow_sparse_union_array_new(), but takes the union data type
 * as an additional first argument. */
GArrowSparseUnionArray *
garrow_sparse_union_array_new_data_type(GArrowSparseUnionDataType *data_type,
GArrowInt8Array *type_ids,
GList *fields,
GError **error);


#define GARROW_TYPE_DENSE_UNION_ARRAY (garrow_dense_union_array_get_type())
Expand All @@ -126,6 +131,12 @@ garrow_dense_union_array_new(GArrowInt8Array *type_ids,
GArrowInt32Array *value_offsets,
GList *fields,
GError **error);
/* Like garrow_dense_union_array_new(), but takes the union data type
 * as an additional first argument. */
GArrowDenseUnionArray *
garrow_dense_union_array_new_data_type(GArrowDenseUnionDataType *data_type,
GArrowInt8Array *type_ids,
GArrowInt32Array *value_offsets,
GList *fields,
GError **error);


#define GARROW_TYPE_DICTIONARY_ARRAY (garrow_dictionary_array_get_type())
Expand Down
90 changes: 64 additions & 26 deletions c_glib/test/test-dense-union-array.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,33 +18,71 @@
class TestDenseUnionArray < Test::Unit::TestCase
include Helper::Buildable

def setup
type_ids = build_int8_array([0, 1, nil, 1, 1])
value_offsets = build_int32_array([0, 0, 0, 1, 2])
fields = [
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
]
@array = Arrow::DenseUnionArray.new(type_ids, value_offsets, fields)
end
sub_test_case(".new") do
sub_test_case("default") do
def setup
type_ids = build_int8_array([0, 1, nil, 1, 1])
value_offsets = build_int32_array([0, 0, 0, 1, 2])
fields = [
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
]
@array = Arrow::DenseUnionArray.new(type_ids, value_offsets, fields)
end

def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::DenseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
end
def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::DenseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
end

def test_field
assert_equal([
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
],
[
@array.get_field(0),
@array.get_field(1),
])
end
end

sub_test_case("DataType") do
def setup
data_type_fields = [
Arrow::Field.new("number", Arrow::Int16DataType.new),
Arrow::Field.new("text", Arrow::StringDataType.new),
]
type_codes = [11, 13]
@data_type = Arrow::DenseUnionDataType.new(data_type_fields, type_codes)
type_ids = build_int8_array([11, 13, nil, 13, 13])
value_offsets = build_int32_array([0, 0, 0, 1, 2])
fields = [
build_int16_array([1]),
build_string_array(["a", "b", "c"])
]
@array = Arrow::DenseUnionArray.new(@data_type, type_ids, value_offsets, fields)
end

def test_value_data_type
assert_equal(@data_type,
@array.value_data_type)
end

def test_field
assert_equal([
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
],
[
@array.get_field(0),
@array.get_field(1),
])
def test_field
assert_equal([
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
],
[
@array.get_field(0),
@array.get_field(1),
])
end
end
end
end
87 changes: 62 additions & 25 deletions c_glib/test/test-sparse-union-array.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,32 +18,69 @@
class TestSparseUnionArray < Test::Unit::TestCase
include Helper::Buildable

def setup
type_ids = build_int8_array([0, 1, nil, 1, 0])
fields = [
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
]
@array = Arrow::SparseUnionArray.new(type_ids, fields)
end
sub_test_case(".new") do
sub_test_case("default") do
def setup
type_ids = build_int8_array([0, 1, nil, 1, 0])
fields = [
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
]
@array = Arrow::SparseUnionArray.new(type_ids, fields)
end

def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::SparseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
end
def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::SparseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
end

def test_field
assert_equal([
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
],
[
@array.get_field(0),
@array.get_field(1),
])
end
end

sub_test_case("DataType") do
def setup
data_type_fields = [
Arrow::Field.new("number", Arrow::Int16DataType.new),
Arrow::Field.new("text", Arrow::StringDataType.new),
]
type_codes = [11, 13]
@data_type = Arrow::SparseUnionDataType.new(data_type_fields, type_codes)
type_ids = build_int8_array([11, 13, nil, 13, 11])
fields = [
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
]
@array = Arrow::SparseUnionArray.new(@data_type, type_ids, fields)
end

def test_value_data_type
assert_equal(@data_type,
@array.value_data_type)
end

def test_field
assert_equal([
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
],
[
@array.get_field(0),
@array.get_field(1),
])
def test_field
assert_equal([
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
],
[
@array.get_field(0),
@array.get_field(1),
])
end
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,8 @@ def build_record_batch(type, records)
offsets << (type_ids.count(type_id) - 1)
end
end
# TODO
# union_array = Arrow::DenseUnionArray.new(schema.fields[0].data_type,
# Arrow::Int8Array.new(type_ids),
# Arrow::Int32Array.new(offsets),
# arrays)
union_array = Arrow::DenseUnionArray.new(Arrow::Int8Array.new(type_ids),
union_array = Arrow::DenseUnionArray.new(schema.fields[0].data_type,
Arrow::Int8Array.new(type_ids),
Arrow::Int32Array.new(offsets),
arrays)
schema = Arrow::Schema.new(column: union_array.value_data_type)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,8 @@ def build_record_batch(type, records)
type_ids << type_codes[1]
end
end
# TODO
# union_array = Arrow::SparseUnionArray.new(schema.fields[0].data_type,
# Arrow::Int8Array.new(type_ids),
# arrays)
union_array = Arrow::SparseUnionArray.new(Arrow::Int8Array.new(type_ids),
union_array = Arrow::SparseUnionArray.new(schema.fields[0].data_type,
Arrow::Int8Array.new(type_ids),
arrays)
schema = Arrow::Schema.new(column: union_array.value_data_type)
Arrow::RecordBatch.new(schema,
Expand Down

0 comments on commit ecfb807

Please sign in to comment.