You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/24 03:41:02 UTC

[arrow] branch master updated: ARROW-5997: [Java] Support dictionary encoding for Union type

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2ee55bc  ARROW-5997: [Java] Support dictionary encoding for Union type
2ee55bc is described below

commit 2ee55bcaca98def0709189fc05674f1f23379d6c
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Tue Jul 23 20:40:26 2019 -0700

    ARROW-5997: [Java] Support dictionary encoding for Union type
    
    Related to [ARROW-5997](https://issues.apache.org/jira/browse/ARROW-5997).
    
    Currently, Union is the only type that dictionary encoding does not support.
    Over the last several weeks we refactored the encoding code, so now is the time to add support for the Union type.
    
    Author: tianchen <ni...@alibaba-inc.com>
    
    Closes #4917 from tianchen92/ARROW-5997 and squashes the following commits:
    
    577b73ce5 <tianchen> fix
    e8a58896f <tianchen> ARROW-5997:  Support dictionary encoding for Union type
---
 .../arrow/vector/dictionary/DictionaryEncoder.java | 11 +--
 .../apache/arrow/vector/TestDictionaryVector.java  | 78 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index 9b16bb1..accf2f9 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -20,7 +20,6 @@ package org.apache.arrow.vector.dictionary;
 import org.apache.arrow.vector.BaseIntVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.ValueVector;
-import org.apache.arrow.vector.types.Types.MinorType;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.util.TransferPair;
@@ -42,9 +41,7 @@ public class DictionaryEncoder {
    * @return dictionary encoded vector
    */
   public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
-    validateType(vector.getMinorType());
-    // load dictionary indices into a hashmap for lookup
-
+    // load dictionary indices into a hash table for lookup
     DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector());
     for (int i = 0; i < dictionary.getVector().getValueCount(); i++) {
       hashTable.put(i);
@@ -114,10 +111,4 @@ public class DictionaryEncoder {
     decoded.setValueCount(count);
     return decoded;
   }
-
-  private static void validateType(MinorType type) {
-    if (type == MinorType.UNION) {
-      throw new IllegalArgumentException("Dictionary encoding not implemented for current type: " + type);
-    }
-  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index 0d2bce9..e0bd218 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -28,10 +28,14 @@ import java.util.Arrays;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.complex.ListVector;
 import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.complex.UnionVector;
 import org.apache.arrow.vector.complex.impl.NullableStructWriter;
 import org.apache.arrow.vector.complex.impl.UnionListWriter;
 import org.apache.arrow.vector.dictionary.Dictionary;
 import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.holders.NullableIntHolder;
+import org.apache.arrow.vector.holders.NullableUInt4Holder;
+import org.apache.arrow.vector.types.Types;
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
 import org.apache.arrow.vector.types.pojo.FieldType;
@@ -328,4 +332,78 @@ public class TestDictionaryVector {
       }
     }
   }
+
+  @Test
+  public void testEncodeUnion() {
+    // Create a new value vector
+    try (final UnionVector vector = new UnionVector("vector", allocator, null);
+        final UnionVector dictionaryVector = new UnionVector("dict", allocator, null);) {
+
+      final NullableUInt4Holder uintHolder1 = new NullableUInt4Holder();
+      uintHolder1.value = 10;
+      uintHolder1.isSet = 1;
+
+      final NullableIntHolder intHolder1 = new NullableIntHolder();
+      intHolder1.value = 10;
+      intHolder1.isSet = 1;
+
+      final NullableIntHolder intHolder2 = new NullableIntHolder();
+      intHolder2.value = 20;
+      intHolder2.isSet = 1;
+
+      //write data
+      vector.setType(0, Types.MinorType.UINT4);
+      vector.setSafe(0, uintHolder1);
+
+      vector.setType(1, Types.MinorType.INT);
+      vector.setSafe(1, intHolder1);
+
+      vector.setType(2, Types.MinorType.INT);
+      vector.setSafe(2, intHolder1);
+
+      vector.setType(3, Types.MinorType.INT);
+      vector.setSafe(3, intHolder2);
+
+      vector.setType(4, Types.MinorType.INT);
+      vector.setSafe(4, intHolder2);
+
+      vector.setValueCount(5);
+
+      //write dictionary
+      dictionaryVector.setType(0, Types.MinorType.UINT4);
+      dictionaryVector.setSafe(0, uintHolder1);
+
+      dictionaryVector.setType(1, Types.MinorType.INT);
+      dictionaryVector.setSafe(1, intHolder1);
+
+      dictionaryVector.setType(2, Types.MinorType.INT);
+      dictionaryVector.setSafe(2, intHolder2);
+
+      dictionaryVector.setValueCount(3);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+        // verify indices
+        assertEquals(IntVector.class, encoded.getClass());
+
+        IntVector index = ((IntVector)encoded);
+        assertEquals(5, index.getValueCount());
+        assertEquals(0, index.get(0));
+        assertEquals(1, index.get(1));
+        assertEquals(1, index.get(2));
+        assertEquals(2, index.get(3));
+        assertEquals(2, index.get(4));
+
+        // now run through the decoder and verify we get the original back
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getValueCount(), decoded.getValueCount());
+          for (int i = 0; i < 5; i++) {
+            assertEquals(vector.getObject(i), decoded.getObject(i));
+          }
+        }
+      }
+    }
+  }
 }