You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by se...@apache.org on 2017/04/28 17:54:25 UTC
[2/2] hive git commit: HIVE-16523 : VectorHashKeyWrapper hash code
for strings is not so good (Sergey Shelukhin/Gopal Vijayaraghavan,
reviewed by Gopal/Sergey)
HIVE-16523 : VectorHashKeyWrapper hash code for strings is not so good (Sergey Shelukhin/Gopal Vijayaraghavan, reviewed by Gopal/Sergey)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e29528cf
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e29528cf
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e29528cf
Branch: refs/heads/branch-2
Commit: e29528cf06a0aae24c386c525f48276b70d54a01
Parents: 7b1f6ef
Author: sergey <se...@apache.org>
Authored: Fri Apr 28 10:51:02 2017 -0700
Committer: sergey <se...@apache.org>
Committed: Fri Apr 28 10:51:13 2017 -0700
----------------------------------------------------------------------
.../ql/exec/vector/VectorHashKeyWrapper.java | 56 ++++++++----
.../exec/vector/VectorHashKeyWrapperBatch.java | 7 +-
.../org/apache/hive/common/util/Murmur3.java | 96 ++++++++++++++++++++
3 files changed, 141 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/e29528cf/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java
index 2c51882..3e1fcdd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.hive.ql.exec.vector;
+import org.apache.hive.common.util.Murmur3;
+
import java.sql.Timestamp;
import java.util.Arrays;
@@ -41,6 +43,17 @@ import com.google.common.base.Preconditions;
*/
public class VectorHashKeyWrapper extends KeyWrapper {
+ public static final class HashContext {
+ private final Murmur3.IncrementalHash32 bytesHash = new Murmur3.IncrementalHash32();
+
+ public static Murmur3.IncrementalHash32 getBytesHash(HashContext ctx) {
+ if (ctx == null) {
+ return new Murmur3.IncrementalHash32();
+ }
+ return ctx.bytesHash;
+ }
+ }
+
private static final int[] EMPTY_INT_ARRAY = new int[0];
private static final long[] EMPTY_LONG_ARRAY = new long[0];
private static final double[] EMPTY_DOUBLE_ARRAY = new double[0];
@@ -73,10 +86,13 @@ public class VectorHashKeyWrapper extends KeyWrapper {
private int hashcode;
- private VectorHashKeyWrapper(int longValuesCount, int doubleValuesCount,
+ private HashContext hashCtx;
+
+ private VectorHashKeyWrapper(HashContext ctx, int longValuesCount, int doubleValuesCount,
int byteValuesCount, int decimalValuesCount, int timestampValuesCount,
int intervalDayTimeValuesCount,
int keyCount) {
+ hashCtx = ctx;
longValues = longValuesCount > 0 ? new long[longValuesCount] : EMPTY_LONG_ARRAY;
doubleValues = doubleValuesCount > 0 ? new double[doubleValuesCount] : EMPTY_DOUBLE_ARRAY;
decimalValues = decimalValuesCount > 0 ? new HiveDecimalWritable[decimalValuesCount] : EMPTY_DECIMAL_ARRAY;
@@ -107,14 +123,14 @@ public class VectorHashKeyWrapper extends KeyWrapper {
private VectorHashKeyWrapper() {
}
- public static VectorHashKeyWrapper allocate(int longValuesCount, int doubleValuesCount,
+ public static VectorHashKeyWrapper allocate(HashContext ctx, int longValuesCount, int doubleValuesCount,
int byteValuesCount, int decimalValuesCount, int timestampValuesCount,
int intervalDayTimeValuesCount, int keyCount) {
if ((longValuesCount + doubleValuesCount + byteValuesCount + decimalValuesCount
+ timestampValuesCount + intervalDayTimeValuesCount) == 0) {
return EMPTY_KEY_WRAPPER;
}
- return new VectorHashKeyWrapper(longValuesCount, doubleValuesCount, byteValuesCount,
+ return new VectorHashKeyWrapper(ctx, longValuesCount, doubleValuesCount, byteValuesCount,
decimalValuesCount, timestampValuesCount, intervalDayTimeValuesCount,
keyCount);
}
@@ -126,40 +142,44 @@ public class VectorHashKeyWrapper extends KeyWrapper {
@Override
public void setHashKey() {
- hashcode = Arrays.hashCode(longValues) ^
+ // compute locally and assign
+ int hash = Arrays.hashCode(longValues) ^
Arrays.hashCode(doubleValues) ^
Arrays.hashCode(isNull);
for (int i = 0; i < decimalValues.length; i++) {
// Use the new faster hash code since we are hashing memory objects.
- hashcode ^= decimalValues[i].newFasterHashCode();
+ hash ^= decimalValues[i].newFasterHashCode();
}
for (int i = 0; i < timestampValues.length; i++) {
- hashcode ^= timestampValues[i].hashCode();
+ hash ^= timestampValues[i].hashCode();
}
for (int i = 0; i < intervalDayTimeValues.length; i++) {
- hashcode ^= intervalDayTimeValues[i].hashCode();
+ hash ^= intervalDayTimeValues[i].hashCode();
}
// This code, with branches and all, is not executed if there are no string keys
+ Murmur3.IncrementalHash32 bytesHash = null;
for (int i = 0; i < byteValues.length; ++i) {
/*
* Hashing the string is potentially expensive so is better to branch.
* Additionally not looking at values for nulls allows us not reset the values.
*/
- if (byteLengths[i] != -1) {
- byte[] bytes = byteValues[i];
- int start = byteStarts[i];
- int length = byteLengths[i];
- // Unfortunately there is no Arrays.hashCode(byte[], start, length)
- for(int j = start; j < start + length; ++j) {
- // use 461 as is a (sexy!) prime.
- hashcode ^= 461 * bytes[j];
- }
+ if (byteLengths[i] == -1) {
+ continue;
}
+ if (bytesHash == null) {
+ bytesHash = HashContext.getBytesHash(hashCtx);
+ bytesHash.start(hash);
+ }
+ bytesHash.add(byteValues[i], byteStarts[i], byteLengths[i]);
+ }
+ if (bytesHash != null) {
+ hash = bytesHash.end();
}
+ this.hashcode = hash;
}
@Override
@@ -171,6 +191,7 @@ public class VectorHashKeyWrapper extends KeyWrapper {
public boolean equals(Object that) {
if (that instanceof VectorHashKeyWrapper) {
VectorHashKeyWrapper keyThat = (VectorHashKeyWrapper)that;
+ // not comparing hashCtx - irrelevant
return hashcode == keyThat.hashcode &&
Arrays.equals(longValues, keyThat.longValues) &&
Arrays.equals(doubleValues, keyThat.doubleValues) &&
@@ -211,6 +232,7 @@ public class VectorHashKeyWrapper extends KeyWrapper {
}
public void duplicateTo(VectorHashKeyWrapper clone) {
+ clone.hashCtx = hashCtx;
clone.longValues = (longValues.length > 0) ? longValues.clone() : EMPTY_LONG_ARRAY;
clone.doubleValues = (doubleValues.length > 0) ? doubleValues.clone() : EMPTY_DOUBLE_ARRAY;
clone.isNull = isNull.clone();
@@ -464,7 +486,7 @@ public class VectorHashKeyWrapper extends KeyWrapper {
public static final class EmptyVectorHashKeyWrapper extends VectorHashKeyWrapper {
private EmptyVectorHashKeyWrapper() {
- super(0, 0, 0, 0, 0, 0, /* keyCount */ 0);
+ super(null, 0, 0, 0, 0, 0, 0, /* keyCount */ 0);
// no need to override assigns - all assign ops will fail due to 0 size
}
http://git-wip-us.apache.org/repos/asf/hive/blob/e29528cf/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapperBatch.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapperBatch.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapperBatch.java
index 63cdf94..cf73a7f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapperBatch.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapperBatch.java
@@ -62,6 +62,11 @@ public class VectorHashKeyWrapperBatch extends VectorColumnSetInfo {
*/
private int keysFixedSize;
+ /**
+ * Shared hashcontext for all keys in this batch
+ */
+ private final VectorHashKeyWrapper.HashContext hashCtx = new VectorHashKeyWrapper.HashContext();
+
/**
* Returns the compiled fixed size for the key wrappers.
* @return
@@ -960,7 +965,7 @@ public class VectorHashKeyWrapperBatch extends VectorColumnSetInfo {
}
public VectorHashKeyWrapper allocateKeyWrapper() {
- return VectorHashKeyWrapper.allocate(
+ return VectorHashKeyWrapper.allocate(hashCtx,
longIndices.length,
doubleIndices.length,
stringIndices.length,
http://git-wip-us.apache.org/repos/asf/hive/blob/e29528cf/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
index 88c3514..1c56765 100644
--- a/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+++ b/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
@@ -332,4 +332,100 @@ public class Murmur3 {
h ^= (h >>> 33);
return h;
}
+
+ public static class IncrementalHash32 {
+ byte[] tail = new byte[3];
+ int tailLen;
+ int totalLen;
+ int hash;
+
+ public final void start(int hash) {
+ tailLen = totalLen = 0;
+ this.hash = hash;
+ }
+
+ public final void add(byte[] data, int offset, int length) {
+ if (length == 0) return;
+ totalLen += length;
+ if (tailLen + length < 4) {
+ System.arraycopy(data, offset, tail, tailLen, length);
+ tailLen += length;
+ return;
+ }
+ int offset2 = 0;
+ if (tailLen > 0) {
+ offset2 = (4 - tailLen);
+ int k = -1;
+ switch (tailLen) {
+ case 1:
+ k = orBytes(tail[0], data[0], data[1], data[2]);
+ break;
+ case 2:
+ k = orBytes(tail[0], tail[1], data[0], data[1]);
+ break;
+ case 3:
+ k = orBytes(tail[0], tail[1], tail[2], data[0]);
+ break;
+ default: throw new AssertionError(tailLen);
+ }
+ // mix functions
+ k *= C1_32;
+ k = Integer.rotateLeft(k, R1_32);
+ k *= C2_32;
+ hash ^= k;
+ hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+ }
+ int length2 = length - offset2;
+ offset += offset2;
+ final int nblocks = length2 >> 2;
+
+ for (int i = 0; i < nblocks; i++) {
+ int i_4 = (i << 2) + offset;
+ int k = orBytes(data[i_4], data[i_4 + 1], data[i_4 + 2], data[i_4 + 3]);
+
+ // mix functions
+ k *= C1_32;
+ k = Integer.rotateLeft(k, R1_32);
+ k *= C2_32;
+ hash ^= k;
+ hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+ }
+
+ int consumed = (nblocks << 2);
+ tailLen = length2 - consumed;
+ if (consumed == length2) return;
+ System.arraycopy(data, offset + consumed, tail, 0, tailLen);
+ }
+
+ public final int end() {
+ int k1 = 0;
+ switch (tailLen) {
+ case 3:
+ k1 ^= tail[2] << 16;
+ case 2:
+ k1 ^= tail[1] << 8;
+ case 1:
+ k1 ^= tail[0];
+
+ // mix functions
+ k1 *= C1_32;
+ k1 = Integer.rotateLeft(k1, R1_32);
+ k1 *= C2_32;
+ hash ^= k1;
+ }
+
+ // finalization
+ hash ^= totalLen;
+ hash ^= (hash >>> 16);
+ hash *= 0x85ebca6b;
+ hash ^= (hash >>> 13);
+ hash *= 0xc2b2ae35;
+ hash ^= (hash >>> 16);
+ return hash;
+ }
+ }
+
+ private static int orBytes(byte b1, byte b2, byte b3, byte b4) {
+ return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
+ }
}