You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/24 03:41:02 UTC
[arrow] branch master updated: ARROW-5997: [Java] Support dictionary encoding for Union type
This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2ee55bc ARROW-5997: [Java] Support dictionary encoding for Union type
2ee55bc is described below
commit 2ee55bcaca98def0709189fc05674f1f23379d6c
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Tue Jul 23 20:40:26 2019 -0700
ARROW-5997: [Java] Support dictionary encoding for Union type
Related to [ARROW-5997](https://issues.apache.org/jira/browse/ARROW-5997).
Union is currently the only type that dictionary encoding does not support.
Over the last several weeks we refactored the encoding code, and it is now time to add support for the Union type.
Author: tianchen <ni...@alibaba-inc.com>
Closes #4917 from tianchen92/ARROW-5997 and squashes the following commits:
577b73ce5 <tianchen> fix
e8a58896f <tianchen> ARROW-5997: Support dictionary encoding for Union type
---
.../arrow/vector/dictionary/DictionaryEncoder.java | 11 +--
.../apache/arrow/vector/TestDictionaryVector.java | 78 ++++++++++++++++++++++
2 files changed, 79 insertions(+), 10 deletions(-)
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index 9b16bb1..accf2f9 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -20,7 +20,6 @@ package org.apache.arrow.vector.dictionary;
import org.apache.arrow.vector.BaseIntVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueVector;
-import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.TransferPair;
@@ -42,9 +41,7 @@ public class DictionaryEncoder {
* @return dictionary encoded vector
*/
public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
- validateType(vector.getMinorType());
- // load dictionary indices into a hashmap for lookup
-
+ // load dictionary indices into a hash table for lookup
DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector());
for (int i = 0; i < dictionary.getVector().getValueCount(); i++) {
hashTable.put(i);
@@ -114,10 +111,4 @@ public class DictionaryEncoder {
decoded.setValueCount(count);
return decoded;
}
-
- private static void validateType(MinorType type) {
- if (type == MinorType.UNION) {
- throw new IllegalArgumentException("Dictionary encoding not implemented for current type: " + type);
- }
- }
}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index 0d2bce9..e0bd218 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -28,10 +28,14 @@ import java.util.Arrays;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.complex.ListVector;
import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.complex.UnionVector;
import org.apache.arrow.vector.complex.impl.NullableStructWriter;
import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.holders.NullableIntHolder;
+import org.apache.arrow.vector.holders.NullableUInt4Holder;
+import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
import org.apache.arrow.vector.types.pojo.FieldType;
@@ -328,4 +332,78 @@ public class TestDictionaryVector {
}
}
}
+
+ @Test
+ public void testEncodeUnion() {
+ // Create a new value vector
+ try (final UnionVector vector = new UnionVector("vector", allocator, null);
+ final UnionVector dictionaryVector = new UnionVector("dict", allocator, null);) {
+
+ final NullableUInt4Holder uintHolder1 = new NullableUInt4Holder();
+ uintHolder1.value = 10;
+ uintHolder1.isSet = 1;
+
+ final NullableIntHolder intHolder1 = new NullableIntHolder();
+ intHolder1.value = 10;
+ intHolder1.isSet = 1;
+
+ final NullableIntHolder intHolder2 = new NullableIntHolder();
+ intHolder2.value = 20;
+ intHolder2.isSet = 1;
+
+ //write data
+ vector.setType(0, Types.MinorType.UINT4);
+ vector.setSafe(0, uintHolder1);
+
+ vector.setType(1, Types.MinorType.INT);
+ vector.setSafe(1, intHolder1);
+
+ vector.setType(2, Types.MinorType.INT);
+ vector.setSafe(2, intHolder1);
+
+ vector.setType(3, Types.MinorType.INT);
+ vector.setSafe(3, intHolder2);
+
+ vector.setType(4, Types.MinorType.INT);
+ vector.setSafe(4, intHolder2);
+
+ vector.setValueCount(5);
+
+ //write dictionary
+ dictionaryVector.setType(0, Types.MinorType.UINT4);
+ dictionaryVector.setSafe(0, uintHolder1);
+
+ dictionaryVector.setType(1, Types.MinorType.INT);
+ dictionaryVector.setSafe(1, intHolder1);
+
+ dictionaryVector.setType(2, Types.MinorType.INT);
+ dictionaryVector.setSafe(2, intHolder2);
+
+ dictionaryVector.setValueCount(3);
+
+ Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+ try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+ // verify indices
+ assertEquals(IntVector.class, encoded.getClass());
+
+ IntVector index = ((IntVector)encoded);
+ assertEquals(5, index.getValueCount());
+ assertEquals(0, index.get(0));
+ assertEquals(1, index.get(1));
+ assertEquals(1, index.get(2));
+ assertEquals(2, index.get(3));
+ assertEquals(2, index.get(4));
+
+ // now run through the decoder and verify we get the original back
+ try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+ assertEquals(vector.getClass(), decoded.getClass());
+ assertEquals(vector.getValueCount(), decoded.getValueCount());
+ for (int i = 0; i < 5; i++) {
+ assertEquals(vector.getObject(i), decoded.getObject(i));
+ }
+ }
+ }
+ }
+ }
}