You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/02/07 21:44:48 UTC

arrow git commit: ARROW-366 Java Dictionary Vector

Repository: arrow
Updated Branches:
  refs/heads/master e97fbe640 -> c322cbf22


ARROW-366 Java Dictionary Vector

I've added a dictionary type, and a partial implementation of a dictionary vector that just wraps an index vector and has a reference to a lookup vector. The spec seems to indicate that any array can be dictionary encoded, but the C++ implementation created a new type, so I went that way.
Feedback would be appreciated - I want to make sure I'm on the right path.

Author: Emilio Lahr-Vivaz <el...@ccri.com>

Closes #309 from elahrvivaz/ARROW-366 and squashes the following commits:

60836ea [Emilio Lahr-Vivaz] removing dictionary ID from encoded vector
0871e13 [Emilio Lahr-Vivaz] ARROW-366 Adding Java dictionary vector


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c322cbf2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c322cbf2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c322cbf2

Branch: refs/heads/master
Commit: c322cbf225b5da5e17ceec0e9e7373852bcba85c
Parents: e97fbe6
Author: Emilio Lahr-Vivaz <el...@ccri.com>
Authored: Tue Feb 7 16:44:35 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Feb 7 16:44:35 2017 -0500

----------------------------------------------------------------------
 .../arrow/vector/complex/DictionaryVector.java  | 229 +++++++++++++++++++
 .../apache/arrow/vector/types/Dictionary.java   |  40 ++++
 .../apache/arrow/vector/types/pojo/Field.java   |  35 ++-
 .../java/org/apache/arrow/vector/util/Text.java |  31 ++-
 .../arrow/vector/TestDictionaryVector.java      | 154 +++++++++++++
 5 files changed, 482 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c322cbf2/java/vector/src/main/java/org/apache/arrow/vector/complex/DictionaryVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/DictionaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/DictionaryVector.java
new file mode 100644
index 0000000..84760ea
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/DictionaryVector.java
@@ -0,0 +1,229 @@
+/*******************************************************************************
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.arrow.vector.complex;
+
+import io.netty.buffer.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.OutOfMemoryException;
+import org.apache.arrow.vector.NullableIntVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.types.Dictionary;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.TransferPair;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+public class DictionaryVector implements ValueVector { // wraps an index vector plus its Dictionary; ValueVector calls delegate to the index vector
+
+  private ValueVector indices; // dictionary encoded values
+  private Dictionary dictionary; // dictionary the indices refer to; left open by close()
+
+  public DictionaryVector(ValueVector indices, Dictionary dictionary) {
+    this.indices = indices;
+    this.dictionary = dictionary;
+  }
+
+  /**
+   * Dictionary encodes a vector. The dictionary is built from the distinct non-null values of the vector, in order of first occurrence.
+   *
+   * @param vector vector to encode; VARBINARY, LIST, MAP and UNION minor types are rejected with IllegalArgumentException
+   * @return dictionary encoded vector; its index vector and dictionary vector are both created with the input vector's allocator
+   */
+  public static DictionaryVector encode(ValueVector vector) {
+    validateType(vector.getMinorType());
+    Map<Object, Integer> lookUps = new HashMap<>(); // value -> dictionary position
+    Map<Integer, Integer> transfers = new HashMap<>(); // source row of first occurrence -> dictionary position
+
+    ValueVector.Accessor accessor = vector.getAccessor();
+    int count = accessor.getValueCount();
+
+    NullableIntVector indices = new NullableIntVector(vector.getField().getName(), vector.getAllocator());
+    indices.allocateNew(count);
+    NullableIntVector.Mutator mutator = indices.getMutator();
+
+    int nextIndex = 0;
+    for (int i = 0; i < count; i++) {
+      Object value = accessor.getObject(i);
+      if (value != null) { // a null value is skipped, so no index is written for that row
+        Integer index = lookUps.get(value);
+        if (index == null) {
+          index = nextIndex++; // first occurrence: take the next dictionary position
+          lookUps.put(value, index);
+          transfers.put(i, index); // remembered for the copy into the dictionary vector below
+        }
+        mutator.set(i, index);
+      }
+    }
+    mutator.setValueCount(count);
+
+    // copy each first-seen value from the source vector into its position in the dictionary vector
+    TransferPair dictionaryTransfer = vector.getTransferPair(vector.getAllocator());
+    ValueVector dictionaryVector = dictionaryTransfer.getTo();
+    dictionaryVector.allocateNewSafe(); // NOTE(review): the boolean result is ignored here - confirm a failed allocation is acceptable
+    for (Map.Entry<Integer, Integer> entry: transfers.entrySet()) {
+      dictionaryTransfer.copyValueSafe(entry.getKey(), entry.getValue());
+    }
+    dictionaryVector.getMutator().setValueCount(transfers.size());
+    Dictionary dictionary = new Dictionary(dictionaryVector, false); // generated dictionaries are flagged as not ordered
+
+    return new DictionaryVector(indices, dictionary);
+  }
+
+  /**
+   * Dictionary encodes a vector with a provided dictionary. The dictionary must contain all values in the vector.
+   *
+   * @param vector vector to encode; VARBINARY, LIST, MAP and UNION minor types are rejected with IllegalArgumentException
+   * @param dictionary dictionary used for encoding; the returned vector keeps a reference to it rather than a copy
+   * @return dictionary encoded vector whose index vector is created with the input vector's allocator
+   */
+  public static DictionaryVector encode(ValueVector vector, Dictionary dictionary) {
+    validateType(vector.getMinorType());
+    // load dictionary values into a hashmap for lookup
+    ValueVector.Accessor dictionaryAccessor = dictionary.getDictionary().getAccessor();
+    Map<Object, Integer> lookUps = new HashMap<>(dictionaryAccessor.getValueCount());
+    for (int i = 0; i < dictionaryAccessor.getValueCount(); i++) {
+      // values are keyed by equals/hashCode, so array-typed values would need a wrapper that implements both appropriately
+      lookUps.put(dictionaryAccessor.getObject(i), i);
+    }
+
+    // vector to hold our indices (dictionary encoded values)
+    NullableIntVector indices = new NullableIntVector(vector.getField().getName(), vector.getAllocator());
+    NullableIntVector.Mutator mutator = indices.getMutator();
+
+    ValueVector.Accessor accessor = vector.getAccessor();
+    int count = accessor.getValueCount();
+
+    indices.allocateNew(count);
+
+    for (int i = 0; i < count; i++) {
+      Object value = accessor.getObject(i);
+      if (value != null) { // a null value is skipped, so no index is written for that row
+        // note: lookUps.get returns null for a value missing from the dictionary, so this may fail (presumably a NullPointerException on unboxing - confirm)
+        mutator.set(i, lookUps.get(value));
+      }
+    }
+    mutator.setValueCount(count);
+
+    return new DictionaryVector(indices, dictionary);
+  }
+
+  /**
+   * Decodes a dictionary encoded array using the provided dictionary.
+   *
+   * @param indices dictionary encoded values; each non-null entry is cast to Number and used as a position in the dictionary vector
+   * @param dictionary dictionary used to decode the values
+   * @return vector with values restored from dictionary, created with the allocator of the index vector
+   */
+  public static ValueVector decode(ValueVector indices, Dictionary dictionary) {
+    ValueVector.Accessor accessor = indices.getAccessor();
+    int count = accessor.getValueCount();
+    ValueVector dictionaryVector = dictionary.getDictionary();
+    // copy the dictionary values into the decoded vector
+    TransferPair transfer = dictionaryVector.getTransferPair(indices.getAllocator());
+    transfer.getTo().allocateNewSafe(); // NOTE(review): the boolean result is ignored here - confirm a failed allocation is acceptable
+    for (int i = 0; i < count; i++) {
+      Object index = accessor.getObject(i);
+      if (index != null) { // a null index is skipped, so nothing is copied into row i
+        transfer.copyValueSafe(((Number) index).intValue(), i);
+      }
+    }
+
+    ValueVector decoded = transfer.getTo();
+    decoded.getMutator().setValueCount(count);
+    return decoded;
+  }
+
+  private static void validateType(MinorType type) {
+    // byte arrays don't work as keys in our dictionary map - we could wrap them with something to
+    // implement equals and hashcode if we want that functionality; LIST, MAP and UNION are rejected as well
+    if (type == MinorType.VARBINARY || type == MinorType.LIST || type == MinorType.MAP || type == MinorType.UNION) {
+      throw new IllegalArgumentException("Dictionary encoding for complex types not implemented");
+    }
+  }
+
+  public ValueVector getIndexVector() { return indices; }
+
+  public ValueVector getDictionaryVector() { return dictionary.getDictionary(); }
+
+  public Dictionary getDictionary() { return dictionary; }
+
+  @Override
+  public MinorType getMinorType() { return indices.getMinorType(); }
+
+  @Override
+  public Field getField() { return indices.getField(); }
+
+  // note: only the index vector is closed; the dictionary vector is not closed, as it may be shared
+  @Override
+  public void close() { indices.close(); }
+
+  @Override
+  public void allocateNew() throws OutOfMemoryException { indices.allocateNew(); }
+
+  @Override
+  public boolean allocateNewSafe() { return indices.allocateNewSafe(); }
+
+  @Override
+  public BufferAllocator getAllocator() { return indices.getAllocator();  }
+
+  @Override
+  public void setInitialCapacity(int numRecords) { indices.setInitialCapacity(numRecords); }
+
+  @Override
+  public int getValueCapacity() { return indices.getValueCapacity(); }
+
+  @Override
+  public int getBufferSize() { return indices.getBufferSize(); }
+
+  @Override
+  public int getBufferSizeFor(int valueCount) { return indices.getBufferSizeFor(valueCount); }
+
+  @Override
+  public Iterator<ValueVector> iterator() {
+    return indices.iterator();
+  }
+
+  @Override
+  public void clear() { indices.clear(); }
+
+  @Override
+  public TransferPair getTransferPair(BufferAllocator allocator) { return indices.getTransferPair(allocator); }
+
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) { return indices.getTransferPair(ref, allocator); }
+
+  @Override
+  public TransferPair makeTransferPair(ValueVector target) { return indices.makeTransferPair(target); }
+
+  @Override
+  public Accessor getAccessor() { return indices.getAccessor(); }
+
+  @Override
+  public Mutator getMutator() { return indices.getMutator(); }
+
+  @Override
+  public FieldReader getReader() { return indices.getReader(); }
+
+  @Override
+  public ArrowBuf[] getBuffers(boolean clear) { return indices.getBuffers(clear); }
+}

http://git-wip-us.apache.org/repos/asf/arrow/blob/c322cbf2/java/vector/src/main/java/org/apache/arrow/vector/types/Dictionary.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Dictionary.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Dictionary.java
new file mode 100644
index 0000000..fbe1345
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Dictionary.java
@@ -0,0 +1,40 @@
+/*******************************************************************************
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.arrow.vector.types;
+
+import org.apache.arrow.vector.ValueVector;
+
+public class Dictionary { // pairs a vector of dictionary values with an "ordered" flag
+
+    private ValueVector dictionary; // vector holding the dictionary values
+    private boolean ordered; // flag supplied by the creator; NOTE(review): its exact meaning is not shown here - confirm
+
+    public Dictionary(ValueVector dictionary, boolean ordered) { // stores both arguments as given, no copy or validation
+        this.dictionary = dictionary;
+        this.ordered = ordered;
+    }
+
+    public ValueVector getDictionary() { // returns the same vector instance passed to the constructor
+        return dictionary;
+    }
+
+    public boolean isOrdered() { // returns the flag passed to the constructor
+        return ordered;
+    }
+}

http://git-wip-us.apache.org/repos/asf/arrow/blob/c322cbf2/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java
index 412fc54..2d528e4 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java
@@ -24,6 +24,9 @@ import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField;
 import java.util.List;
 import java.util.Objects;
 
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonInclude.Include;
+import org.apache.arrow.flatbuf.DictionaryEncoding;
 import org.apache.arrow.vector.schema.TypeLayout;
 import org.apache.arrow.vector.schema.VectorLayout;
 
@@ -37,6 +40,7 @@ public class Field {
   private final String name;
   private final boolean nullable;
   private final ArrowType type;
+  private final Long dictionary;
   private final List<Field> children;
   private final TypeLayout typeLayout;
 
@@ -45,11 +49,13 @@ public class Field {
       @JsonProperty("name") String name,
       @JsonProperty("nullable") boolean nullable,
       @JsonProperty("type") ArrowType type,
+      @JsonProperty("dictionary") Long dictionary,
       @JsonProperty("children") List<Field> children,
       @JsonProperty("typeLayout") TypeLayout typeLayout) {
     this.name = name;
     this.nullable = nullable;
     this.type = checkNotNull(type);
+    this.dictionary = dictionary;
     if (children == null) {
       this.children = ImmutableList.of();
     } else {
@@ -59,13 +65,22 @@ public class Field {
   }
 
   public Field(String name, boolean nullable, ArrowType type, List<Field> children) {
-    this(name, nullable, type, children, TypeLayout.getTypeLayout(checkNotNull(type)));
+    this(name, nullable, type, null, children, TypeLayout.getTypeLayout(checkNotNull(type)));
+  }
+
+  public Field(String name, boolean nullable, ArrowType type, Long dictionary, List<Field> children) {
+    this(name, nullable, type, dictionary, children, TypeLayout.getTypeLayout(checkNotNull(type)));
   }
 
   public static Field convertField(org.apache.arrow.flatbuf.Field field) {
     String name = field.name();
     boolean nullable = field.nullable();
     ArrowType type = getTypeForField(field);
+    DictionaryEncoding dictionaryEncoding = field.dictionary();
+    Long dictionary = null;
+    if (dictionaryEncoding != null) {
+      dictionary = dictionaryEncoding.id();
+    }
     ImmutableList.Builder<org.apache.arrow.vector.schema.VectorLayout> layout = ImmutableList.builder();
     for (int i = 0; i < field.layoutLength(); ++i) {
       layout.add(new org.apache.arrow.vector.schema.VectorLayout(field.layout(i)));
@@ -75,8 +90,7 @@ public class Field {
       childrenBuilder.add(convertField(field.children(i)));
     }
     List<Field> children = childrenBuilder.build();
-    Field result = new Field(name, nullable, type, children, new TypeLayout(layout.build()));
-    return result;
+    return new Field(name, nullable, type, dictionary, children, new TypeLayout(layout.build()));
   }
 
   public void validate() {
@@ -89,6 +103,11 @@ public class Field {
   public int getField(FlatBufferBuilder builder) {
     int nameOffset = name == null ? -1 : builder.createString(name);
     int typeOffset = type.getType(builder);
+    int dictionaryOffset = -1;
+    if (dictionary != null) {
+      builder.addLong(dictionary);
+      dictionaryOffset = builder.offset();
+    }
     int[] childrenData = new int[children.size()];
     for (int i = 0; i < children.size(); i++) {
       childrenData[i] = children.get(i).getField(builder);
@@ -107,6 +126,9 @@ public class Field {
     org.apache.arrow.flatbuf.Field.addNullable(builder, nullable);
     org.apache.arrow.flatbuf.Field.addTypeType(builder, type.getTypeID().getFlatbufID());
     org.apache.arrow.flatbuf.Field.addType(builder, typeOffset);
+    if (dictionary != null) {
+      org.apache.arrow.flatbuf.Field.addDictionary(builder, dictionaryOffset);
+    }
     org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset);
     org.apache.arrow.flatbuf.Field.addLayout(builder, layoutOffset);
     return org.apache.arrow.flatbuf.Field.endField(builder);
@@ -124,6 +146,9 @@ public class Field {
     return type;
   }
 
+  @JsonInclude(Include.NON_NULL)
+  public Long getDictionary() { return dictionary; }
+
   public List<Field> getChildren() {
     return children;
   }
@@ -141,6 +166,7 @@ public class Field {
     return Objects.equals(this.name, that.name) &&
             Objects.equals(this.nullable, that.nullable) &&
             Objects.equals(this.type, that.type) &&
+           Objects.equals(this.dictionary, that.dictionary) &&
             (Objects.equals(this.children, that.children) ||
                     (this.children == null && that.children.size() == 0) ||
                     (this.children.size() == 0 && that.children == null));
@@ -153,6 +179,9 @@ public class Field {
       sb.append(name).append(": ");
     }
     sb.append(type);
+    if (dictionary != null) {
+      sb.append("[dictionary: ").append(dictionary).append("]");
+    }
     if (!children.isEmpty()) {
       sb.append("<").append(Joiner.on(", ").join(children)).append(">");
     }

http://git-wip-us.apache.org/repos/asf/arrow/blob/c322cbf2/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
index 3919f06..3db4358 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
@@ -299,6 +299,11 @@ public class Text {
   /** Returns true iff <code>o</code> is a Text with the same contents. */
   @Override
   public boolean equals(Object o) {
+    if (o == this) {
+      return true;
+    } else if (o == null) {
+      return false;
+    }
     if (!(o instanceof Text)) {
       return false;
     }
@@ -308,15 +313,33 @@ public class Text {
       return false;
     }
 
-    byte[] thisBytes = Arrays.copyOf(this.getBytes(), getLength());
-    byte[] thatBytes = Arrays.copyOf(that.getBytes(), getLength());
-    return Arrays.equals(thisBytes, thatBytes);
+    // copied from Arrays.equals so we don't have to copy the byte arrays
+    for (int i = 0; i < length; i++) {
+      if (bytes[i] != that.bytes[i]) {
+        return false;
+      }
+    }
 
+    return true;
   }
 
+  /**
+   * Copied from Arrays.hashCode so we don't have to copy the byte array
+   *
+   * @return hash code computed over the first {@code length} bytes, or 0 if the byte array is null
+   */
   @Override
   public int hashCode() {
-    return super.hashCode();
+    if (bytes == null) {
+      return 0;
+    }
+
+    int result = 1;
+    for (int i = 0; i < length; i++) {
+      result = 31 * result + bytes[i];
+    }
+
+    return result;
   }
 
   // / STATIC UTILITIES FROM HERE DOWN

http://git-wip-us.apache.org/repos/asf/arrow/blob/c322cbf2/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
new file mode 100644
index 0000000..962950a
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.DictionaryVector;
+import org.apache.arrow.vector.types.Dictionary;
+import org.apache.arrow.vector.types.Types.MinorType;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+public class TestDictionaryVector { // round-trip tests: encode a VARCHAR vector, check indices/dictionary, decode and compare
+
+  private BufferAllocator allocator;
+
+  byte[] zero = "foo".getBytes(StandardCharsets.UTF_8); // expected at dictionary position 0
+  byte[] one  = "bar".getBytes(StandardCharsets.UTF_8); // expected at dictionary position 1
+  byte[] two  = "baz".getBytes(StandardCharsets.UTF_8); // expected at dictionary position 2
+
+  @Before
+  public void init() {
+    allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100);
+  }
+
+  @After
+  public void terminate() throws Exception {
+    allocator.close();
+  }
+
+  @Test
+  public void testEncodeStringsWithGeneratedDictionary() {
+    // Create a new value vector to be encoded
+    try (final NullableVarCharVector vector = (NullableVarCharVector) MinorType.VARCHAR.getNewVector("foo", allocator, null)) {
+      final NullableVarCharVector.Mutator m = vector.getMutator();
+      vector.allocateNew(512, 5);
+
+      // set some values: foo, bar, bar, baz, foo
+      m.setSafe(0, zero, 0, zero.length);
+      m.setSafe(1, one, 0, one.length);
+      m.setSafe(2, one, 0, one.length);
+      m.setSafe(3, two, 0, two.length);
+      m.setSafe(4, zero, 0, zero.length);
+      m.setValueCount(5);
+
+      DictionaryVector encoded = DictionaryVector.encode(vector);
+
+      try {
+        // verify the dictionary holds the three distinct values in first-seen order
+        ValueVector dictionary = encoded.getDictionaryVector();
+        assertEquals(vector.getClass(), dictionary.getClass());
+
+        NullableVarCharVector.Accessor dictionaryAccessor = ((NullableVarCharVector) dictionary).getAccessor();
+        assertEquals(3, dictionaryAccessor.getValueCount());
+        assertArrayEquals(zero, dictionaryAccessor.get(0));
+        assertArrayEquals(one, dictionaryAccessor.get(1));
+        assertArrayEquals(two, dictionaryAccessor.get(2));
+
+        // verify each row was encoded as the dictionary position of its value
+        ValueVector indices = encoded.getIndexVector();
+        assertEquals(NullableIntVector.class, indices.getClass());
+
+        NullableIntVector.Accessor indexAccessor = ((NullableIntVector) indices).getAccessor();
+        assertEquals(5, indexAccessor.getValueCount());
+        assertEquals(0, indexAccessor.get(0));
+        assertEquals(1, indexAccessor.get(1));
+        assertEquals(1, indexAccessor.get(2));
+        assertEquals(2, indexAccessor.get(3));
+        assertEquals(0, indexAccessor.get(4));
+
+        // now run through the decoder and verify we get the original back
+        try (ValueVector decoded = DictionaryVector.decode(indices, encoded.getDictionary())) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount());
+          for (int i = 0; i < 5; i++) {
+            assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i));
+          }
+        }
+      } finally { // the generated dictionary vector is closed here explicitly, together with the index vector
+        encoded.getDictionaryVector().close();
+        encoded.getIndexVector().close();
+      }
+    }
+  }
+
+  @Test
+  public void testEncodeStringsWithProvidedDictionary() {
+    // Create a new value vector to be encoded, plus the vector backing the provided dictionary
+    try (final NullableVarCharVector vector = (NullableVarCharVector) MinorType.VARCHAR.getNewVector("foo", allocator, null);
+         final NullableVarCharVector dictionary = (NullableVarCharVector) MinorType.VARCHAR.getNewVector("dict", allocator, null)) {
+      final NullableVarCharVector.Mutator m = vector.getMutator();
+      vector.allocateNew(512, 5);
+
+      // set some values: foo, bar, bar, baz, foo
+      m.setSafe(0, zero, 0, zero.length);
+      m.setSafe(1, one, 0, one.length);
+      m.setSafe(2, one, 0, one.length);
+      m.setSafe(3, two, 0, two.length);
+      m.setSafe(4, zero, 0, zero.length);
+      m.setValueCount(5);
+
+      // set some dictionary values: foo, bar, baz at positions 0, 1, 2
+      final NullableVarCharVector.Mutator m2 = dictionary.getMutator();
+      dictionary.allocateNew(512, 3);
+      m2.setSafe(0, zero, 0, zero.length);
+      m2.setSafe(1, one, 0, one.length);
+      m2.setSafe(2, two, 0, two.length);
+      m2.setValueCount(3);
+
+      try(final DictionaryVector encoded = DictionaryVector.encode(vector, new Dictionary(dictionary, false))) { // the dictionary vector itself is closed by the outer try
+        // verify each row was encoded as the dictionary position of its value
+        ValueVector indices = encoded.getIndexVector();
+        assertEquals(NullableIntVector.class, indices.getClass());
+
+        NullableIntVector.Accessor indexAccessor = ((NullableIntVector) indices).getAccessor();
+        assertEquals(5, indexAccessor.getValueCount());
+        assertEquals(0, indexAccessor.get(0));
+        assertEquals(1, indexAccessor.get(1));
+        assertEquals(1, indexAccessor.get(2));
+        assertEquals(2, indexAccessor.get(3));
+        assertEquals(0, indexAccessor.get(4));
+
+        // now run through the decoder and verify we get the original back
+        try (ValueVector decoded = DictionaryVector.decode(indices, encoded.getDictionary())) {
+          assertEquals(vector.getClass(), decoded.getClass());
+          assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount());
+          for (int i = 0; i < 5; i++) {
+            assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i));
+          }
+        }
+      }
+    }
+  }
+}