You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/19 03:41:09 UTC
[arrow] branch master updated: ARROW-5902: [Java] Implement hash
table and equals & hashCode API for dictionary encoding
This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e1559f2 ARROW-5902: [Java] Implement hash table and equals & hashCode API for dictionary encoding
e1559f2 is described below
commit e1559f21abfd385b56b9e5c18ba31c997ee581d7
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Thu Jul 18 20:36:54 2019 -0700
ARROW-5902: [Java] Implement hash table and equals & hashCode API for dictionary encoding
As discussed in https://github.com/apache/arrow/pull/4792
Implement a hash table to only store hash & index, meanwhile add check equal function in ValueVector API.
Author: tianchen <ni...@alibaba-inc.com>
Closes #4846 from tianchen92/hasher and squashes the following commits:
2db730268 <tianchen> fix
5facc2a51 <tianchen> resolve comments
175192a38 <tianchen> fix test and style
7a87526e3 <tianchen> implementation of equals and hashCode
c89608b7a <tianchen> fix
8f2e1a2f7 <tianchen> hash table prototype
---
.../src/main/codegen/templates/UnionVector.java | 51 +++-
.../apache/arrow/vector/BaseFixedWidthVector.java | 30 +++
.../arrow/vector/BaseVariableWidthVector.java | 30 +++
.../java/org/apache/arrow/vector/ValueVector.java | 14 ++
.../org/apache/arrow/vector/VarCharVector.java | 1 -
.../java/org/apache/arrow/vector/ZeroVector.java | 10 +
.../arrow/vector/complex/FixedSizeListVector.java | 32 +++
.../apache/arrow/vector/complex/ListVector.java | 43 ++++
.../vector/complex/NonNullableStructVector.java | 51 ++++
.../apache/arrow/vector/complex/StructVector.java | 10 +
.../arrow/vector/dictionary/DictionaryEncoder.java | 26 +-
.../vector/dictionary/DictionaryHashTable.java | 268 +++++++++++++++++++++
.../arrow/vector/util/ByteFunctionHelpers.java | 42 ++++
.../arrow/vector/types/pojo/TestExtensionType.java | 11 +
14 files changed, 587 insertions(+), 32 deletions(-)
diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java
index 04eed72..b05005d 100644
--- a/java/vector/src/main/codegen/templates/UnionVector.java
+++ b/java/vector/src/main/codegen/templates/UnionVector.java
@@ -17,6 +17,7 @@
import io.netty.buffer.ArrowBuf;
import org.apache.arrow.memory.ReferenceManager;
+import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.types.pojo.FieldType;
<@pp.dropOutputFile />
@@ -491,32 +492,35 @@ public class UnionVector implements FieldVector {
return vectors.iterator();
}
-
- public Object getObject(int index) {
+ private ValueVector getVector(int index) {
int type = typeBuffer.getByte(index * TYPE_WIDTH);
switch (MinorType.values()[type]) {
- case NULL:
- return null;
+ case NULL:
+ return null;
<#list vv.types as type>
<#list type.minor as minor>
<#assign name = minor.class?cap_first />
<#assign fields = minor.fields!type.fields />
<#assign uncappedName = name?uncap_first/>
<#if !minor.typeParams?? >
- case ${name?upper_case}:
- return get${name}Vector().getObject(index);
+ case ${name?upper_case}:
+ return get${name}Vector();
</#if>
</#list>
</#list>
- case STRUCT:
- return getStruct().getObject(index);
- case LIST:
- return getList().getObject(index);
- default:
- throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]);
+ case STRUCT:
+ return getStruct();
+ case LIST:
+ return getList();
+ default:
+ throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]);
}
}
+ public Object getObject(int index) {
+ return getVector(index).getObject(index);
+ }
+
public byte[] get(int index) {
return null;
}
@@ -622,4 +626,27 @@ public class UnionVector implements FieldVector {
private int getTypeBufferValueCapacity() {
return typeBuffer.capacity() / TYPE_WIDTH;
}
+
+ @Override
+ public int hashCode(int index) {
+ return getVector(index).hashCode(index);
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ if (to == null) {
+ return false;
+ }
+ if (this.getClass() != to.getClass()) {
+ return false;
+ }
+ UnionVector that = (UnionVector) to;
+ ValueVector leftVector = getVector(index);
+ ValueVector rightVector = that.getVector(toIndex);
+
+ if (leftVector.getClass() != rightVector.getClass()) {
+ return false;
+ }
+ return leftVector.equals(index, rightVector, toIndex);
+ }
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
index 8feca75..b0d716a 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.ByteFunctionHelpers;
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.util.OversizedAllocationException;
import org.apache.arrow.vector.util.TransferPair;
@@ -837,4 +838,33 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
handleSafe(thisIndex);
copyFrom(fromIndex, thisIndex, from);
}
+
+ @Override
+ public int hashCode(int index) {
+ int start = typeWidth * index;
+ int end = typeWidth * (index + 1);
+ return ByteFunctionHelpers.hash(this.getDataBuffer(), start, end);
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ if (to == null) {
+ return false;
+ }
+ if (this.getClass() != to.getClass()) {
+ return false;
+ }
+
+ BaseFixedWidthVector that = (BaseFixedWidthVector) to;
+
+ int leftStart = typeWidth * index;
+ int leftEnd = typeWidth * (index + 1);
+
+ int rightStart = typeWidth * toIndex;
+ int rightEnd = typeWidth * (toIndex + 1);
+
+ int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, leftEnd,
+ that.getDataBuffer(), rightStart, rightEnd);
+ return ret == 1;
+ }
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index 5262f33..19fcc67 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -27,6 +27,7 @@ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.OutOfMemoryException;
import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.ByteFunctionHelpers;
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.util.OversizedAllocationException;
import org.apache.arrow.vector.util.TransferPair;
@@ -1334,4 +1335,33 @@ public abstract class BaseVariableWidthVector extends BaseValueVector
}
lastSet = thisIndex;
}
+
+ @Override
+ public int hashCode(int index) {
+ final int start = getStartOffset(index);
+ final int end = getStartOffset(index + 1);
+ return ByteFunctionHelpers.hash(this.getDataBuffer(), start, end);
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ if (to == null) {
+ return false;
+ }
+ if (this.getClass() != to.getClass()) {
+ return false;
+ }
+
+ BaseVariableWidthVector that = (BaseVariableWidthVector) to;
+
+ final int leftStart = getStartOffset(index);
+ final int leftEnd = getStartOffset(index + 1);
+
+ final int rightStart = that.getStartOffset(toIndex);
+ final int rightEnd = that.getStartOffset(toIndex + 1);
+
+ int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, leftEnd,
+ that.getDataBuffer(), rightStart, rightEnd);
+ return ret == 1;
+ }
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
index 86a381a..795493a 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
@@ -237,4 +237,18 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
* @return true if element is null
*/
boolean isNull(int index);
+
+ /**
+ * Returns hashCode of element in index.
+ */
+ int hashCode(int index);
+
+ /**
+ * Check whether the element in index equals to the element in targetIndex from the target vector.
+ * @param index index to compare in this vector
+ * @param target target vector
+ * @param targetIndex index to compare in target vector
+ * @return true if equals, otherwise false.
+ */
+ boolean equals(int index, ValueVector target, int targetIndex);
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java
index c012ce3..7a914d9 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java
@@ -270,7 +270,6 @@ public class VarCharVector extends BaseVariableWidthVector {
setSafe(index, text.getBytes(), 0, text.getLength());
}
-
/*----------------------------------------------------------------*
| |
| vector transfer |
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
index 37784ed..b34aa4d 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
@@ -244,4 +244,14 @@ public class ZeroVector implements FieldVector {
public boolean isNull(int index) {
return false;
}
+
+ @Override
+ public int hashCode(int index) {
+ return 0;
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ return false;
+ }
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
index 1d2c8db..c665bef 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
@@ -45,6 +45,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.ByteFunctionHelpers;
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.util.JsonStringArrayList;
import org.apache.arrow.vector.util.OversizedAllocationException;
@@ -502,6 +503,37 @@ public class FixedSizeListVector extends BaseValueVector implements FieldVector,
return new TransferImpl((FixedSizeListVector) target);
}
+ @Override
+ public int hashCode(int index) {
+ if (isSet(index) == 0) {
+ return 0;
+ }
+ int hash = 0;
+ for (int i = 0; i < listSize; i++) {
+ hash = ByteFunctionHelpers.comebineHash(hash, vector.hashCode(index * listSize + i));
+ }
+ return hash;
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ if (to == null) {
+ return false;
+ }
+ if (this.getClass() != to.getClass()) {
+ return false;
+ }
+
+ FixedSizeListVector that = (FixedSizeListVector) to;
+
+ for (int i = 0; i < listSize; i++) {
+ if (!vector.equals(index * listSize + i, that, toIndex * listSize + i)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
private class TransferImpl implements TransferPair {
FixedSizeListVector to;
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 945f50f..43b43bd 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -411,6 +411,49 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector,
return offsetBuffer;
}
+ @Override
+ public int hashCode(int index) {
+ if (isSet(index) == 0) {
+ return 0;
+ }
+ int hash = 0;
+ final int start = offsetBuffer.getInt(index * OFFSET_WIDTH);
+ final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH);
+ for (int i = start; i < end; i++) {
+ hash = 31 * vector.hashCode(i);
+ }
+ return hash;
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ if (to == null) {
+ return false;
+ }
+ if (this.getClass() != to.getClass()) {
+ return false;
+ }
+
+ ListVector that = (ListVector) to;
+ final int leftStart = offsetBuffer.getInt(index * OFFSET_WIDTH);
+ final int leftEnd = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH);
+
+ final int rightStart = that.offsetBuffer.getInt(toIndex * OFFSET_WIDTH);
+ final int rightEnd = that.offsetBuffer.getInt((toIndex + 1) * OFFSET_WIDTH);
+
+ if ((leftEnd - leftStart) != (rightEnd - rightStart)) {
+ return false;
+ }
+
+ for (int i = 0; i < (leftEnd - leftStart); i++) {
+ if (!vector.equals(leftStart + i, that.vector, rightStart + i)) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
private class TransferImpl implements TransferPair {
ListVector to;
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
index 1ca315a..1d9b871 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
@@ -287,6 +287,56 @@ public class NonNullableStructVector extends AbstractStructVector {
}
@Override
+ public int hashCode(int index) {
+ int hash = 0;
+ for (String child : getChildFieldNames()) {
+ ValueVector v = getChild(child);
+ if (v != null && index < v.getValueCount()) {
+ hash += 31 * hash + v.hashCode(index);
+ }
+ }
+ return hash;
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ if (to == null) {
+ return false;
+ }
+ if (this.getClass() != to.getClass()) {
+ return false;
+ }
+ NonNullableStructVector that = (NonNullableStructVector) to;
+ List<ValueVector> leftChildrens = new ArrayList<>();
+ List<ValueVector> rightChildrens = new ArrayList<>();
+
+ for (String child : getChildFieldNames()) {
+ ValueVector v = getChild(child);
+ if (v != null) {
+ leftChildrens.add(v);
+ }
+ }
+
+ for (String child : that.getChildFieldNames()) {
+ ValueVector v = that.getChild(child);
+ if (v != null) {
+ rightChildrens.add(v);
+ }
+ }
+
+ if (leftChildrens.size() != rightChildrens.size()) {
+ return false;
+ }
+
+ for (int i = 0; i < leftChildrens.size(); i++) {
+ if (!leftChildrens.get(i).equals(index, rightChildrens.get(i), toIndex)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
public boolean isNull(int index) {
return false;
}
@@ -372,4 +422,5 @@ public class NonNullableStructVector extends AbstractStructVector {
public List<FieldVector> getChildrenFromFields() {
return getChildren();
}
+
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
index a07e0d2..2ad9fb7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
@@ -480,6 +480,15 @@ public class StructVector extends NonNullableStructVector implements FieldVector
}
@Override
+ public int hashCode(int index) {
+ if (isSet(index) == 0) {
+ return 0;
+ } else {
+ return super.hashCode(index);
+ }
+ }
+
+ @Override
public void get(int index, ComplexHolder holder) {
holder.isSet = isSet(index);
if (holder.isSet == 0) {
@@ -546,4 +555,5 @@ public class StructVector extends NonNullableStructVector implements FieldVector
super.setValueCount(valueCount);
this.valueCount = valueCount;
}
+
}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index 623e4f4..ed69df3 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -17,7 +17,6 @@
package org.apache.arrow.vector.dictionary;
-import org.apache.arrow.vector.BaseBinaryVector;
import org.apache.arrow.vector.BaseIntVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueVector;
@@ -44,15 +43,11 @@ public class DictionaryEncoder {
*/
public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
validateType(vector.getMinorType());
- // load dictionary values into a hashmap for lookup
- DictionaryEncodeHashMap<Object> lookUps = new DictionaryEncodeHashMap<>(dictionary.getVector().getValueCount());
-
- boolean binaryType = isBinaryType(vector.getMinorType());
+ // load dictionary indices into a hashmap for lookup
+ DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector());
for (int i = 0; i < dictionary.getVector().getValueCount(); i++) {
- Object key = binaryType ? ((BaseBinaryVector) dictionary.getVector()).getByteArrayWrapper(i) :
- dictionary.getVector().getObject(i);
- lookUps.put(key, i);
+ hashTable.put(i);
}
Field valueField = vector.getField();
@@ -73,12 +68,12 @@ public class DictionaryEncoder {
int count = vector.getValueCount();
for (int i = 0; i < count; i++) {
- Object value = binaryType ? ((BaseBinaryVector) vector).getByteArrayWrapper(i) : vector.getObject(i);
- if (value != null) { // if it's null leave it null
+ if (!vector.isNull(i)) { // if it's null leave it null
// note: this may fail if value was not included in the dictionary
- int encoded = lookUps.get(value);
+ //int encoded = lookUps.get(value);
+ int encoded = hashTable.getIndex(i, vector);
if (encoded == -1) {
- throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value);
+ throw new IllegalArgumentException("Dictionary encoding not defined for value:" + vector.getObject(i));
}
indices.setWithPossibleTruncate(i, encoded);
}
@@ -119,13 +114,6 @@ public class DictionaryEncoder {
return decoded;
}
- private static boolean isBinaryType(MinorType type) {
- if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY) {
- return true;
- }
- return false;
- }
-
private static void validateType(MinorType type) {
if (type == MinorType.UNION) {
throw new IllegalArgumentException("Dictionary encoding not implemented for current type: " + type);
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java
new file mode 100644
index 0000000..bf0c788
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.Objects;
+
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector)
+ * It stores the index in dictionary vector and for a given index in encode vector,
+ * it could return dictionary index.
+ */
+public class DictionaryHashTable {
+
+ /**
+ * Represents a null value in map.
+ */
+ static final int NULL_VALUE = -1;
+
+ /**
+ * The default initial capacity - MUST be a power of two.
+ */
+ static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;
+
+ /**
+ * The maximum capacity, used if a higher value is implicitly specified
+ * by either of the constructors with arguments.
+ */
+ static final int MAXIMUM_CAPACITY = 1 << 30;
+
+ /**
+ * The load factor used when none specified in constructor.
+ */
+ static final float DEFAULT_LOAD_FACTOR = 0.75f;
+
+ static final DictionaryHashTable.Entry[] EMPTY_TABLE = {};
+
+ /**
+ * The table, initialized on first use, and resized as
+ * necessary. When allocated, length is always a power of two.
+ */
+ transient DictionaryHashTable.Entry[] table = (DictionaryHashTable.Entry[]) EMPTY_TABLE;
+
+ /**
+ * The number of key-value mappings contained in this map.
+ */
+ transient int size;
+
+ /**
+ * The next size value at which to resize (capacity * load factor).
+ */
+ int threshold;
+
+ /**
+ * The load factor for the hash table.
+ */
+ final float loadFactor;
+
+ private final ValueVector dictionary;
+
+ /**
+ * Constructs an empty map with the specified initial capacity and load factor.
+ */
+ public DictionaryHashTable(int initialCapacity, ValueVector dictionary) {
+ if (initialCapacity < 0) {
+ throw new IllegalArgumentException("Illegal initial capacity: " +
+ initialCapacity);
+ }
+ if (initialCapacity > MAXIMUM_CAPACITY) {
+ initialCapacity = MAXIMUM_CAPACITY;
+ }
+ this.loadFactor = DEFAULT_LOAD_FACTOR;
+ this.threshold = initialCapacity;
+
+ this.dictionary = dictionary;
+ }
+
+ public DictionaryHashTable(ValueVector dictionary) {
+ this(DEFAULT_INITIAL_CAPACITY, dictionary);
+ }
+
+ /**
+ * Compute the capacity with given threshold and create init table.
+ */
+ private void inflateTable(int threshold) {
+ int capacity = roundUpToPowerOf2(threshold);
+ this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1);
+ table = new DictionaryHashTable.Entry[capacity];
+ }
+
+ /**
+ * Computes the storage location in an array for the given hashCode.
+ */
+ static int indexFor(int h, int length) {
+ return h & (length - 1);
+ }
+
+ /**
+ * Returns a power of two size for the given size.
+ */
+ static final int roundUpToPowerOf2(int size) {
+ int n = size - 1;
+ n |= n >>> 1;
+ n |= n >>> 2;
+ n |= n >>> 4;
+ n |= n >>> 8;
+ n |= n >>> 16;
+ return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1;
+ }
+
+ /**
+ * get the corresponding dictionary index with the given index in vector which to encode.
+ * @param indexInArray index in vector.
+ * @return dictionary vector index or -1 if no value equals.
+ */
+ public int getIndex(int indexInArray, ValueVector toEncode) {
+ int hash = toEncode.hashCode(indexInArray);
+ int index = indexFor(hash, table.length);
+ for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) {
+ if ((e.hash == hash)) {
+ int dictIndex = e.index;
+ if (dictionary.equals(dictIndex, toEncode, indexInArray)) {
+ return dictIndex;
+ }
+ }
+ }
+ return NULL_VALUE;
+ }
+
+ /**
+ * put the index of dictionary vector to build hash table.
+ */
+ public void put(int indexInDictionary) {
+ if (table == EMPTY_TABLE) {
+ inflateTable(threshold);
+ }
+
+ int hash = dictionary.hashCode(indexInDictionary);
+ int i = indexFor(hash, table.length);
+ for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) {
+ if (e.hash == hash && e.index == indexInDictionary) {
+ //already has this index, return
+ return;
+ }
+ }
+
+ addEntry(hash, indexInDictionary, i);
+ }
+
+ /**
+ * Create a new Entry at the specific position of table.
+ */
+ void createEntry(int hash, int index, int bucketIndex) {
+ DictionaryHashTable.Entry e = table[bucketIndex];
+ table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e);
+ size++;
+ }
+
+ /**
+ * Add Entry at the specified location of the table.
+ */
+ void addEntry(int hash, int index, int bucketIndex) {
+ if ((size >= threshold) && (null != table[bucketIndex])) {
+ resize(2 * table.length);
+ bucketIndex = indexFor(hash, table.length);
+ }
+
+ createEntry(hash, index, bucketIndex);
+ }
+
+ /**
+ * Resize table with given new capacity.
+ */
+ void resize(int newCapacity) {
+ DictionaryHashTable.Entry[] oldTable = table;
+ int oldCapacity = oldTable.length;
+ if (oldCapacity == MAXIMUM_CAPACITY) {
+ threshold = Integer.MAX_VALUE;
+ return;
+ }
+
+ DictionaryHashTable.Entry[] newTable = new DictionaryHashTable.Entry[newCapacity];
+ transfer(newTable);
+ table = newTable;
+ threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1);
+ }
+
+ /**
+ * Transfer entries into new table from old table.
+ * @param newTable new table
+ */
+ void transfer(DictionaryHashTable.Entry[] newTable) {
+ int newCapacity = newTable.length;
+ for (DictionaryHashTable.Entry e : table) {
+ while (null != e) {
+ DictionaryHashTable.Entry next = e.next;
+ int i = indexFor(e.hash, newCapacity);
+ e.next = newTable[i];
+ newTable[i] = e;
+ e = next;
+ }
+ }
+ }
+
+ /**
+ * Returns the number of mappings in this Map.
+ */
+ public int size() {
+ return size;
+ }
+
+ /**
+ * Removes all elements from this map, leaving it empty.
+ */
+ public void clear() {
+ size = 0;
+ for (int i = 0; i < table.length; i++) {
+ table[i] = null;
+ }
+ }
+
+ /**
+ * Class to keep dictionary index data within hash table.
+ */
+ static class Entry {
+ //dictionary index
+ int index;
+ DictionaryHashTable.Entry next;
+ int hash;
+
+ Entry(int hash, int index, DictionaryHashTable.Entry next) {
+ this.index = index;
+ this.hash = hash;
+ this.next = next;
+ }
+
+ public final int getIndex() {
+ return this.index;
+ }
+
+ public final boolean equals(Object o) {
+ if (!(o instanceof DictionaryEncodeHashMap.Entry)) {
+ return false;
+ }
+ DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o;
+ if (Objects.equals(index, e.getIndex())) {
+ return true;
+ }
+ return false;
+ }
+ }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
index fba32b6..8dbdc49 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
@@ -49,6 +49,48 @@ public class ByteFunctionHelpers {
return memEqual(left.memoryAddress(), lStart, lEnd, right.memoryAddress(), rStart, rEnd);
}
+ /**
+ * Compute hashCode with the given {@link ArrowBuf} and start/end index.
+ */
+ public static final int hash(final ArrowBuf buf, int start, int end) {
+ long addr = buf.memoryAddress();
+ int len = end - start;
+ long pos = addr + start;
+
+ int hash = 0;
+
+ while (len > 7) {
+ long value = PlatformDependent.getLong(pos);
+ hash = comebineHash(hash, Long.hashCode(value));
+
+ pos += 8;
+ len -= 8;
+ }
+
+ while (len > 3) {
+ int value = PlatformDependent.getInt(pos);
+ hash = comebineHash(hash, value);
+
+ pos += 4;
+ len -= 4;
+ }
+
+ while (len-- != 0) {
+ byte value = PlatformDependent.getByte(pos);
+ hash = comebineHash(hash, value);
+ pos ++;
+ }
+
+ return hash;
+ }
+
+ /**
+ * Generate a new hashCode with the given current hashCode and new hashCode.
+ */
+ public static int comebineHash(int currentHash, int newHash) {
+ return currentHash * 31 + newHash;
+ }
+
private static int memEqual(final long laddr, int lStart, int lEnd, final long raddr, int rStart,
final int rEnd) {
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
index 20d270c..792bd29 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
@@ -34,6 +34,7 @@ import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ExtensionTypeVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowFileReader;
import org.apache.arrow.vector.ipc.ArrowFileWriter;
@@ -204,6 +205,16 @@ public class TestExtensionType {
return new UUID(bb.getLong(), bb.getLong());
}
+ @Override
+ public int hashCode(int index) {
+ return getUnderlyingVector().hashCode(index);
+ }
+
+ @Override
+ public boolean equals(int index, ValueVector to, int toIndex) {
+ return getUnderlyingVector().equals(index, to, toIndex);
+ }
+
public void set(int index, UUID uuid) {
ByteBuffer bb = ByteBuffer.allocate(16);
bb.putLong(uuid.getMostSignificantBits());