You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/12 07:27:10 UTC

[arrow] branch master updated: ARROW-5883: [Java] Support dictionary encoding for List and Struct type

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ab7ff65  ARROW-5883: [Java] Support dictionary encoding for List and Struct type
ab7ff65 is described below

commit ab7ff651d4cec0e8a8b0e7a4cf2ceb90f97a52dd
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Fri Jul 12 00:25:54 2019 -0700

    ARROW-5883: [Java] Support dictionary encoding for List and Struct type
    
    As described in http://arrow.apache.org/docs/format/Layout.html#dictionary-encoding, List type encoding should be supported.
    
    ListVector.getObject now returns an ArrayList implementation whose equals and hashCode are already overridden, so the returned object can be used directly as a HashMap key in DictionaryEncoder. Since the dictionary data is not changed during the encoding/decoding process, using a mutable key does not matter here.
    StructVector is similar to ListVector.
    
    Author: tianchen <ni...@alibaba-inc.com>
    
    Closes #4830 from tianchen92/ARROW-5883 and squashes the following commits:
    
    ced12cc01 <tianchen> add helper method
    03731547e <tianchen> support struct type
    497753179 <tianchen> ARROW-5883:  Support Dictionary Encoding for List type
---
 .../arrow/vector/dictionary/DictionaryEncoder.java |   3 +-
 .../apache/arrow/vector/TestDictionaryVector.java  | 136 +++++++++++++++++++++
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index ccd4b55..a28ea5b 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -117,8 +117,7 @@ public class DictionaryEncoder {
   private static void validateType(MinorType type) {
     // byte arrays don't work as keys in our dictionary map - we could wrap them with something to
     // implement equals and hashcode if we want that functionality
-    if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.LIST ||
-        type == MinorType.STRUCT || type == MinorType.UNION) {
+    if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.UNION) {
       throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type);
     }
   }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index a04326b..36e763c 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -23,9 +23,15 @@ import static org.junit.Assert.assertEquals;
 import java.nio.charset.StandardCharsets;
 
 import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.complex.impl.NullableStructWriter;
+import org.apache.arrow.vector.complex.impl.UnionListWriter;
 import org.apache.arrow.vector.dictionary.Dictionary;
 import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+import org.apache.arrow.vector.types.pojo.FieldType;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -142,4 +148,134 @@ public class TestDictionaryVector {
       }
     }
   }
+
+  private void writeListVector(UnionListWriter writer, int[] values) {
+    writer.startList(); // every element of values is written between startList() and endList()
+    for (int pos = 0; pos < values.length; pos++) {
+      writer.integer().writeInt(values[pos]);
+    }
+    writer.endList();
+  }
+
+  @Test
+  public void testEncodeList() {
+    // Round-trip a ListVector through DictionaryEncoder: encode to indices, decode, compare.
+    try (final ListVector vector = ListVector.empty("vector", allocator);
+        final ListVector dictionaryVector = ListVector.empty("dict", allocator);) {
+
+      UnionListWriter writer = vector.getWriter();
+      writer.allocate();
+
+      // input: six lists, but only two distinct values occur
+      writeListVector(writer, new int[]{10, 20});
+      writeListVector(writer, new int[]{10, 20});
+      writeListVector(writer, new int[]{10, 20});
+      writeListVector(writer, new int[]{30, 40, 50});
+      writeListVector(writer, new int[]{30, 40, 50});
+      writeListVector(writer, new int[]{10, 20});
+
+      writer.setValueCount(6);
+      // dictionary: entry 0 is [10, 20], entry 1 is [30, 40, 50]
+      UnionListWriter dictWriter = dictionaryVector.getWriter();
+      dictWriter.allocate();
+
+      writeListVector(dictWriter, new int[]{10, 20});
+      writeListVector(dictWriter, new int[]{30, 40, 50});
+
+      dictWriter.setValueCount(2);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+        // the encoded form must be an IntVector holding the dictionary position of each input list
+        assertEquals(IntVector.class, encoded.getClass());
+
+        IntVector index = ((IntVector)encoded);
+        assertEquals(6, index.getValueCount());
+        assertEquals(0, index.get(0));
+        assertEquals(0, index.get(1));
+        assertEquals(0, index.get(2));
+        assertEquals(1, index.get(3));
+        assertEquals(1, index.get(4));
+        assertEquals(0, index.get(5));
+
+        // decode and verify that every element (not just a prefix) matches the original
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getValueCount(), decoded.getValueCount());
+          for (int i = 0; i < vector.getValueCount(); i++) {
+            assertEquals(vector.getObject(i), decoded.getObject(i));
+          }
+        }
+      }
+    }
+  }
+
+  private void writeStructVector(NullableStructWriter writer, int value1, long value2) {
+    writer.start(); // both field writes below happen between start() and end()
+    writer.integer("f0").writeInt(value1); // value1 goes to the int field named "f0"
+    writer.bigInt("f1").writeBigInt(value2); // value2 goes to the bigint field named "f1"
+    writer.end();
+  }
+
+  @Test
+  public void testEncodeStruct() {
+    // Round-trip a StructVector through DictionaryEncoder: encode to indices, decode, compare.
+    try (final StructVector vector = StructVector.empty("vector", allocator);
+        final StructVector dictionaryVector = StructVector.empty("dict", allocator);) {
+      vector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      vector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
+      dictionaryVector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      dictionaryVector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
+
+      NullableStructWriter writer = vector.getWriter();
+      writer.allocate();
+      // input: seven structs, but only two distinct (f0, f1) pairs occur
+      writeStructVector(writer, 1, 10L);
+      writeStructVector(writer, 1, 10L);
+      writeStructVector(writer, 1, 10L);
+      writeStructVector(writer, 2, 20L);
+      writeStructVector(writer, 2, 20L);
+      writeStructVector(writer, 2, 20L);
+      writeStructVector(writer, 1, 10L);
+
+      writer.setValueCount(7);
+
+      NullableStructWriter dictWriter = dictionaryVector.getWriter();
+      dictWriter.allocate();
+
+      writeStructVector(dictWriter, 1, 10L);
+      writeStructVector(dictWriter, 2, 20L);
+
+      // dictionary: entry 0 is (1, 10), entry 1 is (2, 20)
+      dictionaryVector.setValueCount(2);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+        // the encoded form must be an IntVector holding the dictionary position of each input struct
+        assertEquals(IntVector.class, encoded.getClass());
+
+        IntVector index = ((IntVector)encoded);
+        assertEquals(7, index.getValueCount());
+        assertEquals(0, index.get(0));
+        assertEquals(0, index.get(1));
+        assertEquals(0, index.get(2));
+        assertEquals(1, index.get(3));
+        assertEquals(1, index.get(4));
+        assertEquals(1, index.get(5));
+        assertEquals(0, index.get(6));
+
+        // decode and verify that every element (not just a prefix) matches the original
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getValueCount(), decoded.getValueCount());
+          for (int i = 0; i < vector.getValueCount(); i++) {
+            assertEquals(vector.getObject(i), decoded.getObject(i));
+          }
+        }
+      }
+    }
+  }
+
 }