You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/01 20:50:34 UTC

arrow git commit: ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements

Repository: arrow
Updated Branches:
  refs/heads/master 75d1f613c -> 4956e90a7


ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements

Author: Li Jin <ic...@gmail.com>

Closes #1024 from icexelloss/dict-bug-ARROW-1407 and squashes the following commits:

b64258ce [Li Jin] Minor style change
e73ae599 [Li Jin] ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/4956e90a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/4956e90a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/4956e90a

Branch: refs/heads/master
Commit: 4956e90a7c08fdf5b40b5a71253fafa4aacde434
Parents: 75d1f61
Author: Li Jin <ic...@gmail.com>
Authored: Fri Sep 1 16:50:30 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Sep 1 16:50:30 2017 -0400

----------------------------------------------------------------------
 .../vector/dictionary/DictionaryEncoder.java    |  2 +-
 .../arrow/vector/TestDictionaryVector.java      | 48 ++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/4956e90a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index 7e20794..3b7dc4a 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -68,7 +68,7 @@ public class DictionaryEncoder {
     Method setter = null;
     for (Class<?> c : ImmutableList.of(int.class, long.class)) {
       try {
-        setter = mutator.getClass().getMethod("set", int.class, c);
+        setter = mutator.getClass().getMethod("setSafe", int.class, c);
         break;
       } catch (NoSuchMethodException e) {
         // ignore

http://git-wip-us.apache.org/repos/asf/arrow/blob/4956e90a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index f2db9ba..f8c16e7 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -39,6 +39,8 @@ public class TestDictionaryVector {
   byte[] one = "bar".getBytes(StandardCharsets.UTF_8);
   byte[] two = "baz".getBytes(StandardCharsets.UTF_8);
 
+  byte[][] data = new byte[][] {zero, one, two};
+
   @Before
   public void init() {
     allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100);
@@ -98,4 +100,50 @@ public class TestDictionaryVector {
       }
     }
   }
+
+  @Test
+  public void testEncodeLargeVector() {
+    // Create a new value vector
+    try (final NullableVarCharVector vector = newNullableVarCharVector("foo", allocator);
+         final NullableVarCharVector dictionaryVector = newNullableVarCharVector("dict", allocator);) {
+      final NullableVarCharVector.Mutator m = vector.getMutator();
+      vector.allocateNew();
+
+      int count = 10000;
+
+      for (int i = 0; i < 10000; ++i) {
+        vector.getMutator().setSafe(i, data[i % 3], 0, data[i % 3].length);
+      }
+      vector.getMutator().setValueCount(count);
+
+      dictionaryVector.allocateNew(512, 3);
+      dictionaryVector.getMutator().setSafe(0, zero, 0, zero.length);
+      dictionaryVector.getMutator().setSafe(1, one, 0, one.length);
+      dictionaryVector.getMutator().setSafe(2, two, 0, two.length);
+      dictionaryVector.getMutator().setValueCount(3);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+
+      try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) {
+        // verify indices
+        assertEquals(NullableIntVector.class, encoded.getClass());
+
+        NullableIntVector.Accessor indexAccessor = ((NullableIntVector) encoded).getAccessor();
+        assertEquals(count, indexAccessor.getValueCount());
+        for (int i = 0; i < count; ++i) {
+          assertEquals(i % 3, indexAccessor.get(i));
+        }
+
+        // now run through the decoder and verify we get the original back
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount());
+          for (int i = 0; i < count; ++i) {
+            assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i));
+          }
+        }
+      }
+    }
+  }
 }