You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/01 20:50:34 UTC
arrow git commit: ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements
Repository: arrow
Updated Branches:
refs/heads/master 75d1f613c -> 4956e90a7
ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements
Author: Li Jin <ic...@gmail.com>
Closes #1024 from icexelloss/dict-bug-ARROW-1407 and squashes the following commits:
b64258ce [Li Jin] Minor style change
e73ae599 [Li Jin] ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/4956e90a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/4956e90a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/4956e90a
Branch: refs/heads/master
Commit: 4956e90a7c08fdf5b40b5a71253fafa4aacde434
Parents: 75d1f61
Author: Li Jin <ic...@gmail.com>
Authored: Fri Sep 1 16:50:30 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Sep 1 16:50:30 2017 -0400
----------------------------------------------------------------------
.../vector/dictionary/DictionaryEncoder.java | 2 +-
.../arrow/vector/TestDictionaryVector.java | 48 ++++++++++++++++++++
2 files changed, 49 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/4956e90a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index 7e20794..3b7dc4a 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -68,7 +68,7 @@ public class DictionaryEncoder {
Method setter = null;
for (Class<?> c : ImmutableList.of(int.class, long.class)) {
try {
- setter = mutator.getClass().getMethod("set", int.class, c);
+ setter = mutator.getClass().getMethod("setSafe", int.class, c);
break;
} catch (NoSuchMethodException e) {
// ignore
http://git-wip-us.apache.org/repos/asf/arrow/blob/4956e90a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
index f2db9ba..f8c16e7 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -39,6 +39,8 @@ public class TestDictionaryVector {
byte[] one = "bar".getBytes(StandardCharsets.UTF_8);
byte[] two = "baz".getBytes(StandardCharsets.UTF_8);
+ byte[][] data = new byte[][] {zero, one, two};
+
@Before
public void init() {
allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100);
@@ -98,4 +100,50 @@ public class TestDictionaryVector {
}
}
}
+
+  /** Regression test for ARROW-1407: dictionary-encodes a vector with more than 4096 elements. */
+  @Test
+  public void testEncodeLargeVector() {
+    // Source vector to encode, plus the vector that backs the dictionary
+    try (final NullableVarCharVector vector = newNullableVarCharVector("foo", allocator);
+         final NullableVarCharVector dictionaryVector = newNullableVarCharVector("dict", allocator)) {
+      vector.allocateNew();
+
+      final int count = 10000; // well above the 4096-element limit reported in ARROW-1407
+      // Cycle through the three sample values so that element i holds data[i % 3]
+      for (int i = 0; i < count; ++i) {
+        vector.getMutator().setSafe(i, data[i % 3], 0, data[i % 3].length);
+      }
+      vector.getMutator().setValueCount(count);
+
+      // Dictionary entries are stored in the same order as data, so index k maps to data[k]
+      dictionaryVector.allocateNew(512, 3);
+      dictionaryVector.getMutator().setSafe(0, zero, 0, zero.length);
+      dictionaryVector.getMutator().setSafe(1, one, 0, one.length);
+      dictionaryVector.getMutator().setSafe(2, two, 0, two.length);
+      dictionaryVector.getMutator().setValueCount(3);
+
+      Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
+
+      try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) {
+        // verify indices
+        assertEquals(NullableIntVector.class, encoded.getClass());
+
+        NullableIntVector.Accessor indexAccessor = ((NullableIntVector) encoded).getAccessor();
+        assertEquals(count, indexAccessor.getValueCount());
+        for (int i = 0; i < count; ++i) {
+          assertEquals(i % 3, indexAccessor.get(i));
+        }
+
+        // now run through the decoder and verify we get the original back
+        try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount());
+          for (int i = 0; i < count; ++i) {
+            assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i));
+          }
+        }
+      }
+    }
+  }
}