Skip to content

Commit

Permalink
ARROW-6672: [Java] Extract a common interface for dictionary builders
Browse files Browse the repository at this point in the history
We need a common interface for dictionary builders to support more sophisticated scenarios, like collecting dictionary statistics.

Closes apache#5486 from liyafan82/fly_0923_build and squashes the following commits:

04e8e65 <liyafan82>  Extract a common interface for dictionary builders

Authored-by: liyafan82 <[email protected]>
Signed-off-by: Micah Kornfield <[email protected]>
  • Loading branch information
liyafan82 authored and emkornfield committed Oct 24, 2019
1 parent 776165c commit 7fc4a37
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 53 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.algorithm.dictionary;

import org.apache.arrow.vector.ValueVector;

/**
* Common interface for dictionary builders.
*
* <p>
* A dictionary builder is intended for the scenario frequently encountered in practice:
* the dictionary is not known a priori, so it is generated dynamically.
* In particular, when a new value arrives, it is tested to check if it is already
* in the dictionary. If so, it is simply ignored; otherwise, it is added to the dictionary.
* </p>
* <p>
* A dictionary builder is intended to build a single dictionary,
* so one builder instance cannot be used for different dictionaries.
* </p>
* <p>Sample code for using a dictionary builder:</p>
* <pre>{@code
* DictionaryBuilder<V> dictionaryBuilder = ...
* ...
* dictionaryBuilder.addValue(targetVector, targetIndex);
* ...
* }</pre>
* <p>
* With the above code, the dictionary vector will be populated,
* and it can be retrieved by the {@link DictionaryBuilder#getDictionary()} method.
* After that, dictionary encoding can proceed with the populated dictionary.
* </p>
*
* @param <V> the dictionary vector type.
*/
public interface DictionaryBuilder<V extends ValueVector> {

/**
* Tries to add all values from the target vector to the dictionary.
*
* @param targetVector the target vector containing values to probe.
* @return the number of values actually added to the dictionary.
*/
int addValues(V targetVector);

/**
* Tries to add a single element from the target vector to the dictionary.
*
* @param targetVector the target vector containing the new element.
* @param targetIndex the index of the new element in the target vector.
* @return the index of the new element in the dictionary.
*/
int addValue(V targetVector, int targetIndex);

/**
* Gets the dictionary built so far.
*
* @return the dictionary.
*/
V getDictionary();
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,37 +25,13 @@
import org.apache.arrow.vector.ElementAddressableVector;

/**
* A dictionary builder is intended for the scenario frequently encountered in practice:
* the dictionary is not known a priori, so it is generated dynamically.
* In particular, when a new value arrives, it is tested to check if it is already
* in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
*
* <p>
* This class builds the dictionary based on a hash table.
* Each add operation can be finished in O(1) time,
* regardless of the current dictionary size.
* </p>
* <p>
* The dictionary builder is intended to build a single dictionary.
* So it cannot be used for different dictionaries.
* </p>
* <p>Below gives the sample code for using the dictionary builder
* <pre>{@code
* HashTableBasedDictionaryBuilder dictionaryBuilder = ...
* ...
* dictionaryBuild.addValue(newValue);
* ...
* }</pre>
* </p>
* <p>
* With the above code, the dictionary vector will be populated,
* and it can be retrieved by the {@link HashTableBasedDictionaryBuilder#getDictionary()} method.
* After that, dictionary encoding can proceed with the populated dictionary encoder.
* </p>
*
* @param <V> the dictionary vector type.
*/
public class HashTableBasedDictionaryBuilder<V extends ElementAddressableVector> {
public class HashTableBasedDictionaryBuilder<V extends ElementAddressableVector> implements DictionaryBuilder<V> {

/**
* The dictionary to be built.
Expand Down Expand Up @@ -121,6 +97,7 @@ public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull, ArrowBu
*
* @return the dictionary.
*/
@Override
public V getDictionary() {
return dictionary;
}
Expand All @@ -131,6 +108,7 @@ public V getDictionary() {
* @param targetVector the target vector containing values to probe.
* @return the number of values actually added to the dictionary.
*/
@Override
public int addValues(V targetVector) {
int oldDictSize = dictionary.getValueCount();
for (int i = 0; i < targetVector.getValueCount(); i++) {
Expand All @@ -150,6 +128,7 @@ public int addValues(V targetVector) {
* @param targetIndex the index of the new element in the target vector.
* @return the index of the new element in the dictionary.
*/
@Override
public int addValue(V targetVector, int targetIndex) {
targetVector.getDataPointer(targetIndex, nextPointer);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,36 +23,13 @@
import org.apache.arrow.vector.ValueVector;

/**
* A dictionary builder is intended for the scenario frequently encountered in practice:
* the dictionary is not known a priori, so it is generated dynamically.
* In particular, when a new value arrives, it is tested to check if it is already
* in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
* This class builds the dictionary based on a binary search tree.
* Each add operation can be finished in O(log(n)) time,
* where n is the current dictionary size.
*
* <p>
* This class builds the dictionary based on a binary search tree.
* Each add operation can be finished in O(log(n)) time,
* where n is the current dictionary size.
* </p>
* <p>
* The dictionary builder is intended to build a single dictionary.
* So it cannot be used for different dictionaries.
* </p>
* <p>Below gives the sample code for using the dictionary builder
* <pre>{@code
* SearchTreeBasedDictionaryBuilder dictionaryBuilder = ...
* ...
* dictionaryBuild.addValue(newValue);
* ...
* }</pre>
* </p>
* <p>
* With the above code, the dictionary vector will be populated,
* and it can be retrieved by the {@link SearchTreeBasedDictionaryBuilder#getDictionary()} method.
* After that, dictionary encoding can proceed with the populated dictionary.
* </p>
* @param <V> the dictionary vector type.
*/
public class SearchTreeBasedDictionaryBuilder<V extends ValueVector> {
public class SearchTreeBasedDictionaryBuilder<V extends ValueVector> implements DictionaryBuilder<V> {

/**
* The dictionary to be built.
Expand Down Expand Up @@ -106,6 +83,7 @@ public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> c
* {@link SearchTreeBasedDictionaryBuilder#populateSortedDictionary(ValueVector)}.
* @return the dictionary.
*/
@Override
public V getDictionary() {
return dictionary;
}
Expand All @@ -115,6 +93,7 @@ public V getDictionary() {
* @param targetVector the target vector containing values to probe.
* @return the number of values actually added to the dictionary.
*/
@Override
public int addValues(V targetVector) {
int oldDictSize = dictionary.getValueCount();
for (int i = 0; i < targetVector.getValueCount(); i++) {
Expand All @@ -132,6 +111,7 @@ public int addValues(V targetVector) {
* @param targetIndex the index of the new element in the target vector.
* @return the index of the new element in the dictionary.
*/
@Override
public int addValue(V targetVector, int targetIndex) {
// first copy the value to the end of the dictionary
int dictSize = dictionary.getValueCount();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
/**
* Test cases for {@link HashTableBasedDictionaryBuilder}.
*/
public class TestHashTableBasedDictionaryEncoder {
public class TestHashTableBasedDictionaryBuilder {

private BufferAllocator allocator;

Expand Down

0 comments on commit 7fc4a37

Please sign in to comment.