You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/12 07:27:10 UTC
[arrow] branch master updated: ARROW-5883: [Java] Support
dictionary encoding for List and Struct type
This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ab7ff65 ARROW-5883: [Java] Support dictionary encoding for List and Struct type
ab7ff65 is described below
commit ab7ff651d4cec0e8a8b0e7a4cf2ceb90f97a52dd
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Fri Jul 12 00:25:54 2019 -0700
ARROW-5883: [Java] Support dictionary encoding for List and Struct type
As described in http://arrow.apache.org/docs/format/Layout.html#dictionary-encoding, List type encoding should be supported.
ListVector#getObject now returns an ArrayList implementation whose equals and hashCode are already overridden, so its result can be used directly as a HashMap key in DictionaryEncoder. Since the dictionary data is not changed during the encoding/decoding process, using a mutable key does not seem to matter.
StructVector is similar to ListVector.
Author: tianchen <ni...@alibaba-inc.com>
Closes #4830 from tianchen92/ARROW-5883 and squashes the following commits:
ced12cc01 <tianchen> add helper method
03731547e <tianchen> support struct type
497753179 <tianchen> ARROW-5883: Support Dictionary Encoding for List type
---
.../arrow/vector/dictionary/DictionaryEncoder.java | 3 +-
.../apache/arrow/vector/TestDictionaryVector.java | 136 +++++++++++++++++++++
2 files changed, 137 insertions(+), 2 deletions(-)
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index ccd4b55..a28ea5b 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -117,8 +117,7 @@ public class DictionaryEncoder {
private static void validateType(MinorType type) {
// byte arrays don't work as keys in our dictionary map - we could wrap them with something to
// implement equals and hashcode if we want that functionality
- if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.LIST ||
- type == MinorType.STRUCT || type == MinorType.UNION) {
+ if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.UNION) { // LIST and STRUCT are no longer rejected
throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type);
}
}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index a04326b..36e763c 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -23,9 +23,15 @@ import static org.junit.Assert.assertEquals;
import java.nio.charset.StandardCharsets;
import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.complex.impl.NullableStructWriter;
+import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+import org.apache.arrow.vector.types.pojo.FieldType;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -142,4 +148,134 @@ public class TestDictionaryVector {
}
}
}
+
+  private void writeListVector(UnionListWriter writer, int[] values) {
+    writer.startList(); // every int in 'values' is written between startList() and endList()
+    for (int i = 0; i < values.length; i++) {
+      writer.integer().writeInt(values[i]);
+    }
+    writer.endList();
+  }
+
+  @Test
+  public void testEncodeList() {
+    // Encodes a ListVector against a two-entry list dictionary, then decodes it and compares with the input.
+    try (final ListVector vector = ListVector.empty("vector", allocator);
+         final ListVector dictionaryVector = ListVector.empty("dict", allocator)) {
+
+      UnionListWriter writer = vector.getWriter();
+      writer.allocate();
+
+      // six rows, each holding one of the two lists that are stored in the dictionary below
+      writeListVector(writer, new int[]{10, 20});
+      writeListVector(writer, new int[]{10, 20});
+      writeListVector(writer, new int[]{10, 20});
+      writeListVector(writer, new int[]{30, 40, 50});
+      writeListVector(writer, new int[]{30, 40, 50});
+      writeListVector(writer, new int[]{10, 20});
+
+      writer.setValueCount(6);
+
+      UnionListWriter dictWriter = dictionaryVector.getWriter();
+      dictWriter.allocate();
+
+      writeListVector(dictWriter, new int[]{10, 20}); // expected at dictionary index 0
+      writeListVector(dictWriter, new int[]{30, 40, 50}); // expected at dictionary index 1
+
+      dictWriter.setValueCount(2);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+        // the encoded form must be an IntVector with one dictionary index per input row
+        assertEquals(IntVector.class, encoded.getClass());
+
+        IntVector index = (IntVector) encoded;
+        assertEquals(6, index.getValueCount());
+        assertEquals(0, index.get(0));
+        assertEquals(0, index.get(1));
+        assertEquals(0, index.get(2));
+        assertEquals(1, index.get(3));
+        assertEquals(1, index.get(4));
+        assertEquals(0, index.get(5));
+
+        // decode and compare every row with the original; the bound is the value count, not a literal
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getValueCount(), decoded.getValueCount());
+          for (int i = 0; i < vector.getValueCount(); i++) {
+            assertEquals(vector.getObject(i), decoded.getObject(i));
+          }
+        }
+      }
+    }
+  }
+
+ private void writeStructVector(NullableStructWriter writer, int value1, long value2) {
+ writer.start(); // start()/end() bracket the two field writes below
+ writer.integer("f0").writeInt(value1); // value1 is written through the int writer for field "f0"
+ writer.bigInt("f1").writeBigInt(value2); // value2 is written through the bigint writer for field "f1"
+ writer.end();
+ }
+
+  @Test
+  public void testEncodeStruct() {
+    // Encodes a StructVector against a two-entry struct dictionary, then decodes it and compares with the input.
+    try (final StructVector vector = StructVector.empty("vector", allocator);
+         final StructVector dictionaryVector = StructVector.empty("dict", allocator)) {
+      vector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      vector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
+      dictionaryVector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      dictionaryVector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
+
+      NullableStructWriter writer = vector.getWriter();
+      writer.allocate();
+
+      writeStructVector(writer, 1, 10L);
+      writeStructVector(writer, 1, 10L);
+      writeStructVector(writer, 1, 10L);
+      writeStructVector(writer, 2, 20L);
+      writeStructVector(writer, 2, 20L);
+      writeStructVector(writer, 2, 20L);
+      writeStructVector(writer, 1, 10L);
+
+      writer.setValueCount(7);
+
+      NullableStructWriter dictWriter = dictionaryVector.getWriter();
+      dictWriter.allocate();
+
+      writeStructVector(dictWriter, 1, 10L);
+      writeStructVector(dictWriter, 2, 20L);
+
+      // the two structs above are expected at dictionary indices 0 and 1 respectively
+      dictionaryVector.setValueCount(2);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+        // the encoded form must be an IntVector with one dictionary index per input row
+        assertEquals(IntVector.class, encoded.getClass());
+
+        IntVector index = (IntVector) encoded;
+        assertEquals(7, index.getValueCount());
+        assertEquals(0, index.get(0));
+        assertEquals(0, index.get(1));
+        assertEquals(0, index.get(2));
+        assertEquals(1, index.get(3));
+        assertEquals(1, index.get(4));
+        assertEquals(1, index.get(5));
+        assertEquals(0, index.get(6));
+
+        // decode and compare every row with the original; the bound is the value count, not a literal
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getValueCount(), decoded.getValueCount());
+          for (int i = 0; i < vector.getValueCount(); i++) {
+            assertEquals(vector.getObject(i), decoded.getObject(i));
+          }
+        }
+      }
+    }
+  }
+
}