You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/16 02:59:51 UTC

[arrow] branch master updated: ARROW-5835: [Java] Support Dictionary Encoding for binary type

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a222c7d  ARROW-5835: [Java] Support Dictionary Encoding for binary type
a222c7d is described below

commit a222c7d4ddfa43ee35a6ecf1c1563b48ecd6b860
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Sat Jul 13 00:57:00 2019 -0700

    ARROW-5835: [Java] Support Dictionary Encoding for binary type
    
    Related to [ARROW-5835](https://issues.apache.org/jira/browse/ARROW-5835).
    Now is not implemented because byte array is not supported to be HashMap key.
    One possible way is that wrap them with something to implement equals and hashcode.
    
    Author: tianchen <ni...@alibaba-inc.com>
    
    Closes #4792 from tianchen92/ARROW-5835 and squashes the following commits:
    
    f50a19e73 <tianchen> fix UNION regression
    8267c2b2a <tianchen> fix style
    a039bc1fd <tianchen> Support Dictionary Encoding for binary type
---
 .../org/apache/arrow/vector/BaseBinaryVector.java  | 32 +++++++++++++
 .../apache/arrow/vector/FixedSizeBinaryVector.java | 12 ++++-
 .../org/apache/arrow/vector/VarBinaryVector.java   | 12 ++++-
 .../arrow/vector/dictionary/ByteArrayWrapper.java  | 52 ++++++++++++++++++++++
 .../arrow/vector/dictionary/DictionaryEncoder.java | 24 +++++++---
 .../apache/arrow/vector/TestDictionaryVector.java  | 50 +++++++++++++++++++++
 6 files changed, 173 insertions(+), 9 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseBinaryVector.java
new file mode 100644
index 0000000..9ddbd10
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseBinaryVector.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import org.apache.arrow.vector.dictionary.ByteArrayWrapper;
+
+/**
+ * Interface for VarBinaryVector and FixedSizeBinaryVector.
+ */
+public interface BaseBinaryVector {
+
+  /**
+   * Get the {@link ByteArrayWrapper} which holds the byte array at specific index.
+   * @param index index of object to get
+   */
+  ByteArrayWrapper getByteArrayWrapper(int index);
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java
index 61bd57c..862336f 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java
@@ -22,6 +22,7 @@ import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.complex.impl.FixedSizeBinaryReaderImpl;
 import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.dictionary.ByteArrayWrapper;
 import org.apache.arrow.vector.holders.FixedSizeBinaryHolder;
 import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder;
 import org.apache.arrow.vector.types.Types.MinorType;
@@ -37,7 +38,7 @@ import io.netty.buffer.ArrowBuf;
  * binary values which could be null. A validity buffer (bit vector) is
  * maintained to track which elements in the vector are null.
  */
-public class FixedSizeBinaryVector extends BaseFixedWidthVector {
+public class FixedSizeBinaryVector extends BaseFixedWidthVector implements BaseBinaryVector {
   private final int byteWidth;
   private final FieldReader reader;
 
@@ -363,6 +364,15 @@ public class FixedSizeBinaryVector extends BaseFixedWidthVector {
     return new TransferImpl((FixedSizeBinaryVector) to);
   }
 
+  @Override
+  public ByteArrayWrapper getByteArrayWrapper(int index) {
+    if (isNull(index)) {
+      return null;
+    } else {
+      return new ByteArrayWrapper(getObject(index));
+    }
+  }
+
   private class TransferImpl implements TransferPair {
     FixedSizeBinaryVector to;
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java
index bd76f3c..c8232cf 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java
@@ -22,6 +22,7 @@ import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.complex.impl.VarBinaryReaderImpl;
 import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.dictionary.ByteArrayWrapper;
 import org.apache.arrow.vector.holders.NullableVarBinaryHolder;
 import org.apache.arrow.vector.holders.VarBinaryHolder;
 import org.apache.arrow.vector.types.Types.MinorType;
@@ -34,7 +35,7 @@ import org.apache.arrow.vector.util.TransferPair;
  * values which could be NULL. A validity buffer (bit vector) is maintained
  * to track which elements in the vector are null.
  */
-public class VarBinaryVector extends BaseVariableWidthVector {
+public class VarBinaryVector extends BaseVariableWidthVector implements BaseBinaryVector {
   private final FieldReader reader;
 
   /**
@@ -279,6 +280,15 @@ public class VarBinaryVector extends BaseVariableWidthVector {
     return new TransferImpl((VarBinaryVector) to);
   }
 
+  @Override
+  public ByteArrayWrapper getByteArrayWrapper(int index) {
+    if (isNull(index)) {
+      return null;
+    } else {
+      return new ByteArrayWrapper(getObject(index));
+    }
+  }
+
   private class TransferImpl implements TransferPair {
     VarBinaryVector to;
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ByteArrayWrapper.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ByteArrayWrapper.java
new file mode 100644
index 0000000..bcfac39
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ByteArrayWrapper.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.Arrays;
+
+/**
+ * Wrapper class for byte array.
+ */
+public class ByteArrayWrapper {
+  private final byte[] data;
+
+  /**
+   * Constructs a new instance.
+   */
+  public ByteArrayWrapper(byte[] data) {
+    if (data == null) {
+      throw new NullPointerException();
+    }
+
+    this.data = data;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ByteArrayWrapper)) {
+      return false;
+    }
+
+    return Arrays.equals(data, ((ByteArrayWrapper)other).data);
+  }
+
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(data);
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index a28ea5b..623e4f4 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -17,6 +17,7 @@
 
 package org.apache.arrow.vector.dictionary;
 
+import org.apache.arrow.vector.BaseBinaryVector;
 import org.apache.arrow.vector.BaseIntVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.ValueVector;
@@ -45,9 +46,13 @@ public class DictionaryEncoder {
     validateType(vector.getMinorType());
     // load dictionary values into a hashmap for lookup
     DictionaryEncodeHashMap<Object> lookUps = new DictionaryEncodeHashMap<>(dictionary.getVector().getValueCount());
+
+    boolean binaryType = isBinaryType(vector.getMinorType());
+
     for (int i = 0; i < dictionary.getVector().getValueCount(); i++) {
-      // for primitive array types we need a wrapper that implements equals and hashcode appropriately
-      lookUps.put(dictionary.getVector().getObject(i), i);
+      Object key = binaryType ? ((BaseBinaryVector) dictionary.getVector()).getByteArrayWrapper(i) :
+          dictionary.getVector().getObject(i);
+      lookUps.put(key, i);
     }
 
     Field valueField = vector.getField();
@@ -68,7 +73,7 @@ public class DictionaryEncoder {
     int count = vector.getValueCount();
 
     for (int i = 0; i < count; i++) {
-      Object value = vector.getObject(i);
+      Object value = binaryType ? ((BaseBinaryVector) vector).getByteArrayWrapper(i) : vector.getObject(i);
       if (value != null) { // if it's null leave it null
         // note: this may fail if value was not included in the dictionary
         int encoded = lookUps.get(value);
@@ -114,11 +119,16 @@ public class DictionaryEncoder {
     return decoded;
   }
 
+  private static boolean isBinaryType(MinorType type) {
+    if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY) {
+      return true;
+    }
+    return false;
+  }
+
   private static void validateType(MinorType type) {
-    // byte arrays don't work as keys in our dictionary map - we could wrap them with something to
-    // implement equals and hashcode if we want that functionality
-    if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY || type == MinorType.UNION) {
-      throw new IllegalArgumentException("Dictionary encoding for complex types not implemented: type " + type);
+    if (type == MinorType.UNION) {
+      throw new IllegalArgumentException("Dictionary encoding not implemented for current type: " + type);
     }
   }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index 36e763c..0d2bce9 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -17,10 +17,13 @@
 
 package org.apache.arrow.vector;
 
+import static org.apache.arrow.vector.TestUtils.newVarBinaryVector;
 import static org.apache.arrow.vector.TestUtils.newVarCharVector;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.complex.ListVector;
@@ -278,4 +281,51 @@ public class TestDictionaryVector {
     }
   }
 
+  @Test
+  public void testEncodeBinaryVector() {
+    // Create a new value vector
+    try (final VarBinaryVector vector = newVarBinaryVector("foo", allocator);
+        final VarBinaryVector dictionaryVector = newVarBinaryVector("dict", allocator);) {
+      vector.allocateNew(512, 5);
+
+      // set some values
+      vector.setSafe(0, zero, 0, zero.length);
+      vector.setSafe(1, one, 0, one.length);
+      vector.setSafe(2, one, 0, one.length);
+      vector.setSafe(3, two, 0, two.length);
+      vector.setSafe(4, zero, 0, zero.length);
+      vector.setValueCount(5);
+
+      // set some dictionary values
+      dictionaryVector.allocateNew(512, 3);
+      dictionaryVector.setSafe(0, zero, 0, zero.length);
+      dictionaryVector.setSafe(1, one, 0, one.length);
+      dictionaryVector.setSafe(2, two, 0, two.length);
+      dictionaryVector.setValueCount(3);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
+        // verify indices
+        assertEquals(IntVector.class, encoded.getClass());
+
+        IntVector index = ((IntVector)encoded);
+        assertEquals(5, index.getValueCount());
+        assertEquals(0, index.get(0));
+        assertEquals(1, index.get(1));
+        assertEquals(1, index.get(2));
+        assertEquals(2, index.get(3));
+        assertEquals(0, index.get(4));
+
+        // now run through the decoder and verify we get the original back
+        try (VarBinaryVector decoded = (VarBinaryVector) DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getValueCount(), decoded.getValueCount());
+          for (int i = 0; i < 5; i++) {
+            assertTrue(Arrays.equals(vector.getObject(i), decoded.getObject(i)));
+          }
+        }
+      }
+    }
+  }
 }