Skip to content

Commit

Permalink
ARROW-2515 [Python] Add DictionaryValue class, fixing bugs with neste…
Browse files Browse the repository at this point in the history
…d dictionaries

This introduces a scalar value class, DictionaryValue, which fixes a couple of bugs involving dictionaries nested inside ListArrays or inside other DictionaryArrays. It also includes a new test, which failed prior to this commit but now passes. See https://issues.apache.org/jira/browse/ARROW-2515.

This is my first time contributing, so feedback would be most welcome.

Author: Brent Kerby <[email protected]>

Closes apache#1954 from blkerby/DictionaryValue and squashes the following commits:

1e06963 <Brent Kerby> ARROW-2515:  Add DictionaryValue class, fixing bugs with nested dictionaries
  • Loading branch information
blkerby authored and pitrou committed Apr 30, 2018
1 parent 3f5819a commit e8d45eb
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 9 deletions.
9 changes: 0 additions & 9 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -918,15 +918,6 @@ cdef class BinaryArray(Array):

cdef class DictionaryArray(Array):

cdef getitem(self, int64_t i):
    # Resolve element ``i`` through the dictionary: read the i-th entry
    # of the indices array and box the dictionary slot it points to.
    cdef Array values = self.dictionary
    slot = self.indices[i]

    # A null index has no dictionary entry to look up; hand NA back as is.
    if slot is NA:
        return slot

    return box_scalar(values.type, values.sp_array, slot.as_py())

def dictionary_encode(self):
    """
    Return this array itself; no re-encoding is performed.
    """
    return self

Expand Down
24 changes: 24 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,29 @@ cdef class StructValue(ArrayValue):
zip(child_names, wrapped_arrays)
}

cdef class DictionaryValue(ArrayValue):
    """
    Scalar for one position of a dictionary array.

    Exposes both the entry of the indices array at this position
    (``index_value``) and the dictionary entry that index selects
    (``dictionary_value``).
    """

    def as_py(self):
        """
        Return the selected dictionary entry converted with its own
        ``as_py()``.
        """
        entry = self.dictionary_value
        return entry.as_py()

    property index_value:
        # Element of the underlying indices array at this scalar's
        # position (``self.index``).

        def __get__(self):
            cdef CDictionaryArray* dict_arr = \
                <CDictionaryArray*>(self.sp_array.get())

            wrapped_indices = pyarrow_wrap_array(dict_arr.indices())
            return wrapped_indices[self.index]

    property dictionary_value:
        # Element of the underlying dictionary array selected by
        # ``index_value``.

        def __get__(self):
            cdef CDictionaryArray* dict_arr = \
                <CDictionaryArray*>(self.sp_array.get())

            wrapped_dictionary = pyarrow_wrap_array(dict_arr.dictionary())
            position = self.index_value.as_py()
            return wrapped_dictionary[position]


cdef dict _scalar_classes = {
_Type_BOOL: BooleanValue,
Expand All @@ -428,6 +451,7 @@ cdef dict _scalar_classes = {
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
_Type_DECIMAL: DecimalValue,
_Type_STRUCT: StructValue,
_Type_DICTIONARY: DictionaryValue,
}


Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,3 +841,13 @@ def test_struct_array_flatten():
xs, ys = a[1:].flatten()
assert xs.to_pylist() == [None, None]
assert ys.to_pylist() == [None, 2.5]


def test_nested_dictionary_array():
    """Dictionary arrays nested in other arrays convert to Python lists."""
    # Case 1: a dictionary array used as the values of a ListArray.
    encoded = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
    nested_in_list = pa.ListArray.from_arrays([0, 2, 3], encoded)
    expected_lists = [['a', 'b'], ['a']]
    assert nested_in_list.to_pylist() == expected_lists

    # Case 2: a dictionary array used as the dictionary of another
    # dictionary array.
    encoded = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
    nested_in_dict = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], encoded)
    expected_values = ['a', 'b', 'a', 'b', 'a']
    assert nested_in_dict.to_pylist() == expected_values
3 changes: 3 additions & 0 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def test_timestamp(self):

def test_dictionary(self):
colors = ['red', 'green', 'blue']
colors_dict = {'red': 0, 'green': 1, 'blue': 2}
values = pd.Series(colors * 4)

categorical = pd.Categorical(values, categories=colors)
Expand All @@ -188,6 +189,8 @@ def test_dictionary(self):
categorical.categories)
for i, c in enumerate(values):
assert v[i].as_py() == c
assert v[i].dictionary_value == c
assert v[i].index_value == colors_dict[c]

def test_int_hash(self):
# ARROW-640
Expand Down

0 comments on commit e8d45eb

Please sign in to comment.