You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/11/21 15:54:38 UTC
[commons-codec] 01/07: Fully documented MurmurHash3.
This is an automated email from the ASF dual-hosted git repository.
aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git
commit b5ce7fae5bb6794d41216756f080a087cb956e90
Author: aherbert <ah...@apache.org>
AuthorDate: Thu Nov 21 14:04:30 2019 +0000
Fully documented MurmurHash3.
This attempts no code changes.
The highly modified class passes the current tests.
mvn clirr:clirr report shows no changes.
Default mvn goal passes:
mvn clean verify apache-rat:check clirr:check javadoc:javadoc
- Documented all public constants.
- Documented IncrementalHash32 class.
- Added <p> tags to paragraphs.
- Renamed input variables for consistency across all method.
- Renamed internal variables in the hash32, 64, 128 methods so the
similarities in the methods are obvious.
- prefix joavadoc @params and @returns with 'The' consistently
- Added code examples for the methods that hash primitives for the
equivalent using a ByteBuffer to encode the bytes.
- Fix links in the class header.
- Documented sign extension bugs in hash32 and hash128.
- Documented hash64 is not from the reference code and does not match
part of hash128.
- Documented hash64 as may be released in a future version.
- Refactored combination of bytes to little endian int/long into
methods.
---
.../apache/commons/codec/digest/MurmurHash3.java | 807 +++++++++++++++------
1 file changed, 568 insertions(+), 239 deletions(-)
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index 6a276f9..95ab8b1 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -20,28 +20,46 @@ package org.apache.commons.codec.digest;
/**
* MurmurHash3 yields a 32-bit or 128-bit value.
*
- * MurmurHash is a non-cryptographic hash function suitable for general
+ * <p>MurmurHash is a non-cryptographic hash function suitable for general
* hash-based lookup. The name comes from two basic operations, multiply (MU)
* and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
* it is not specifically designed to be difficult to reverse by an adversary,
- * making it unsuitable for cryptographic purposes.
+ * making it unsuitable for cryptographic purposes.</p>
*
- * 32-bit Java port of
- * https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of
- * https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
+ * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32}
+ * and the 128-bit hash function {@code MurmurHash3_x64_128} from Austin Applyby's
+ * original {@code c++} code in SMHasher.</p>
*
- * This is a public domain code with no copyrights. From homepage of MurmurHash
- * (https://code.google.com/p/smhasher/), "All MurmurHash versions are public
- * domain software, and the author disclaims all copyright to their code."
+ * <p>This is public domain code with no copyrights. From homepage of
+ * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
*
- * Copied from Apache Hive:
- * https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+ * <blockquote>
+ * "All MurmurHash versions are public domain software, and the author
+ * disclaims all copyright to their code."
+ * </blockquote>
+ *
+ * <p>Original adaption from Apache Hive. That adaption contains a {@code hash64} method
+ * that is not part of the original MurmurHash3 code.<p>
*
* @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
+ * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">
+ * Original MurmurHash3 c++ code</a>
+ * @see <a href="https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
+ * Apache Hive Murmer3</a>
* @since 1.13
*/
public final class MurmurHash3 {
+ /**
+ * A random number to use for a hash code.
+ */
+ public static final long NULL_HASHCODE = 2862933555777941757L;
+
+ /**
+ * A default seed to use for the murmur hash algorithm.
+ * Has the value {@code 104729}.
+ *
+ */
+ public static final int DEFAULT_SEED = 104729;
/** TODO Replace on Java 8 with Long.BYTES. */
static final int LONG_BYTES = Long.SIZE / Byte.SIZE;
@@ -52,10 +70,7 @@ public final class MurmurHash3 {
/** TODO Replace on Java 8 with Short.BYTES. */
static final int SHORT_BYTES = Short.SIZE / Byte.SIZE;
- // from 64-bit linear congruential generator
- public static final long NULL_HASHCODE = 2862933555777941757L;
-
- // Constants for 32 bit variant
+ // Constants for 32-bit variant
private static final int C1_32 = 0xcc9e2d51;
private static final int C2_32 = 0x1b873593;
private static final int R1_32 = 15;
@@ -63,7 +78,7 @@ public final class MurmurHash3 {
private static final int M_32 = 5;
private static final int N_32 = 0xe6546b64;
- // Constants for 128 bit variant
+ // Constants for 128-bit variant
private static final long C1 = 0x87c37b91114253d5L;
private static final long C2 = 0x4cf5ad432745937fL;
private static final int R1 = 31;
@@ -73,123 +88,223 @@ public final class MurmurHash3 {
private static final int N1 = 0x52dce729;
private static final int N2 = 0x38495ab5;
- public static final int DEFAULT_SEED = 104729;
-
- // all methods static; private constructor.
+ /** No instance methods. */
private MurmurHash3() {
}
/**
- * Generates 32 bit hash from two longs with default seed value.
+ * Generates 32-bit hash from two longs with a default seed value.
+ * This is a helper method that will produce the same result as:
*
- * @param l0 long to hash
- * @param l1 long to hash
- * @return 32 bit hash
- */
- public static int hash32(final long l0, final long l1) {
- return hash32(l0, l1, DEFAULT_SEED);
- }
-
- /**
- * Generates 32 bit hash from a long with default seed value.
+ * <pre>
+ * int seed = 104729;
+ * int hash = hash32(ByteBuffer.allocate(16)
+ * .putLong(data1)
+ * .putLong(data2)
+ * .array(), 16, seed);
+ * </pre>
*
- * @param l0 long to hash
- * @return 32 bit hash
+ * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect
+ * this result as there are no bytes left over from dividing the length by 4.<p>
+ *
+ * @param data1 The first long to hash
+ * @param data2 The second long to hash
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int)
*/
- public static int hash32(final long l0) {
- return hash32(l0, DEFAULT_SEED);
+ public static int hash32(final long data1, final long data2) {
+ return hash32(data1, data2, DEFAULT_SEED);
}
/**
- * Generates 32 bit hash from a long with the given seed.
+ * Generates 32-bit hash from two longs with the given seed.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int hash = hash32(ByteBuffer.allocate(16)
+ * .putLong(data1)
+ * .putLong(data2)
+ * .array(), 16, seed);
+ * </pre>
*
- * @param l0 long to hash
- * @param seed initial seed value
- * @return 32 bit hash
+ * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect
+ * this result as there are no bytes left over from dividing the length by 4.<p>
+ *
+ * @param data1 The first long to hash
+ * @param data2 The second long to hash
+ * @param seed The initial seed value
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int)
*/
- public static int hash32(final long l0, final int seed) {
+ public static int hash32(final long data1, final long data2, final int seed) {
int hash = seed;
- final long r0 = Long.reverseBytes(l0);
+ final long r0 = Long.reverseBytes(data1);
+ final long r1 = Long.reverseBytes(data2);
hash = mix32((int) r0, hash);
hash = mix32((int) (r0 >>> 32), hash);
+ hash = mix32((int) (r1), hash);
+ hash = mix32((int) (r1 >>> 32), hash);
- return fmix32(LONG_BYTES, hash);
+ hash ^= LONG_BYTES * 2;
+ return fmix32(hash);
}
/**
- * Generates 32 bit hash from two longs with the given seed.
+ * Generates 32-bit hash from a long with a default seed value.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int seed = 104729;
+ * int hash = hash32(ByteBuffer.allocate(8)
+ * .putLong(data)
+ * .array(), 8, seed);
+ * </pre>
*
- * @param l0 long to hash
- * @param l1 long to hash
- * @param seed initial seed value
- * @return 32 bit hash
+ * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect
+ * this result as there are no bytes left over from dividing the length by 4.<p>
+ *
+ * @param data The long to hash
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int)
*/
- public static int hash32(final long l0, final long l1, final int seed) {
+ public static int hash32(final long data) {
+ return hash32(data, DEFAULT_SEED);
+ }
+
+ /**
+ * Generates 32-bit hash from a long with the given seed.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int hash = hash32(ByteBuffer.allocate(8)
+ * .putLong(data)
+ * .array(), 8, seed);
+ * </pre>
+ *
+ * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect
+ * this result as there are no bytes left over from dividing the length by 4.<p>
+ *
+ * @param data The long to hash
+ * @param seed The initial seed value
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int)
+ */
+ public static int hash32(final long data, final int seed) {
int hash = seed;
- final long r0 = Long.reverseBytes(l0);
- final long r1 = Long.reverseBytes(l1);
+ final long r0 = Long.reverseBytes(data);
hash = mix32((int) r0, hash);
hash = mix32((int) (r0 >>> 32), hash);
- hash = mix32((int) (r1), hash);
- hash = mix32((int) (r1 >>> 32), hash);
- return fmix32(LONG_BYTES * 2, hash);
+ hash ^= LONG_BYTES;
+ return fmix32(hash);
}
/**
- * Generates 32 bit hash from byte array with the default seed.
+ * Generates 32-bit hash from the byte array with a default seed.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int seed = 104729;
+ * int hash = hash32(data, 0, data.length, seed);
+ * </pre>
*
- * @param data - input byte array
- * @return 32 bit hash
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @param data The input byte array
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int, int)
*/
public static int hash32(final byte[] data) {
return hash32(data, 0, data.length, DEFAULT_SEED);
}
/**
- * Generates 32 bit hash from a string with the default seed.
+ * Generates 32-bit hash from a string with a default seed.
+ * The string is converted to bytes using the default encoding.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int seed = 104729;
+ * byte[] bytes = data.getBytes();
+ * int hash = hash32(bytes, 0, bytes.length, seed);
+ * </pre>
*
- * @param data - input string
- * @return 32 bit hash
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @param data The input string
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int, int)
*/
public static int hash32(final String data) {
- final byte[] origin = data.getBytes();
- return hash32(origin, 0, origin.length, DEFAULT_SEED);
+ final byte[] bytes = data.getBytes();
+ return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
}
/**
- * Generates 32 bit hash from byte array with the default seed.
+ * Generates 32-bit hash from the byte array with the given length and a default seed.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int seed = 104729;
+ * int hash = hash32(data, 0, data.length, seed);
+ * </pre>
*
- * @param data - input byte array
- * @param length - length of array
- * @return 32 bit hash
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @param data The input byte array
+ * @param length The length of array
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int, int)
*/
public static int hash32(final byte[] data, final int length) {
return hash32(data, length, DEFAULT_SEED);
}
/**
- * Generates 32 bit hash from byte array with the given length and seed.
+ * Generates 32-bit hash from the byte array with the given length and seed. This is a
+ * helper method that will produce the same result as:
+ *
+ * <pre>
+ * int hash = hash32(data, 0, data.length, seed);
+ * </pre>
*
- * @param data - input byte array
- * @param length - length of array
- * @param seed - seed. (default 0)
- * @return 32 bit hash
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @param data The input byte array
+ * @param length The length of array
+ * @param seed The initial seed value
+ * @return The 32-bit hash
+ * @see #hash32(byte[], int, int, int)
*/
public static int hash32(final byte[] data, final int length, final int seed) {
return hash32(data, 0, length, seed);
}
/**
- * Generates 32 bit hash from byte array with the given length, offset and seed.
+ * Generates 32-bit hash from the byte array with the given offset, length and seed.
+ *
+ * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
+ * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
*
- * @param data - input byte array
- * @param offset - offset of data
- * @param length - length of array
- * @param seed - seed. (default 0)
- * @return 32 bit hash
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @param data The input byte array
+ * @param offset The offset of data
+ * @param length The length of array
+ * @param seed The initial seed value
+ * @return The 32-bit hash
*/
public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
int hash = seed;
@@ -197,23 +312,24 @@ public final class MurmurHash3 {
// body
for (int i = 0; i < nblocks; i++) {
- final int i_4 = i << 2;
- final int k = (data[offset + i_4] & 0xff) | ((data[offset + i_4 + 1] & 0xff) << 8)
- | ((data[offset + i_4 + 2] & 0xff) << 16) | ((data[offset + i_4 + 3] & 0xff) << 24);
-
+ final int index = offset + (i << 2);
+ final int k = getLittleEndianInt(data, index);
hash = mix32(k, hash);
}
// tail
- final int idx = nblocks << 2;
+ // ************
+ // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
+ // ************
+ final int index = offset + (nblocks << 2);
int k1 = 0;
- switch (length - idx) {
+ switch (offset + length - index) {
case 3:
- k1 ^= data[offset + idx + 2] << 16;
+ k1 ^= data[index + 2] << 16;
case 2:
- k1 ^= data[offset + idx + 1] << 8;
+ k1 ^= data[index + 1] << 8;
case 1:
- k1 ^= data[offset + idx];
+ k1 ^= data[index];
// mix functions
k1 *= C1_32;
@@ -222,26 +338,29 @@ public final class MurmurHash3 {
hash ^= k1;
}
- return fmix32(length, hash);
+ hash ^= length;
+ return fmix32(hash);
}
/**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit
- * variant.
+ * Generates 64-bit hash from a long with a default seed.
*
- * @param data - input byte array
- * @return 64 bit hash
- */
- public static long hash64(final byte[] data) {
- return hash64(data, 0, data.length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit
- * variant.
+ * <p>This is a Murmur3-like 64-bit variant.
+ * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong>
+ * It may be removed in a future release.</p>
+ *
+ * <p>This is a helper method that will produce the same result as:</p>
*
- * @param data - input long
- * @return 64 bit hash
+ * <pre>
+ * int seed = 104729;
+ * long hash = hash64(ByteBuffer.allocate(8)
+ * .putLong(data)
+ * .array(), 0, 8, seed);
+ * <pre>
+ *
+ * @param data The long to hash
+ * @return The 64-bit hash
+ * @see #hash64(byte[], int, int, int)
*/
public static long hash64(final long data) {
long hash = DEFAULT_SEED;
@@ -260,11 +379,24 @@ public final class MurmurHash3 {
}
/**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit
- * variant.
+ * Generates 64-bit hash from an int with a default seed.
+ *
+ * <p>This is a Murmur3-like 64-bit variant.
+ * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong>
+ * It may be removed in a future release.</p>
+ *
+ * <p>This is a helper method that will produce the same result as:</p>
+ *
+ * <pre>
+ * int seed = 104729;
+ * long hash = hash64(ByteBuffer.allocate(4)
+ * .putInt(data)
+ * .array(), 0, 4, seed);
+ * <pre>
*
- * @param data - input int
- * @return 64 bit hash
+ * @param data The int to hash
+ * @return The 64-bit hash
+ * @see #hash64(byte[], int, int, int)
*/
public static long hash64(final int data) {
long k1 = Integer.reverseBytes(data) & (-1L >>> 32);
@@ -281,11 +413,24 @@ public final class MurmurHash3 {
}
/**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit
- * variant.
+ * Generates 64-bit hash from a short with a default seed.
+ *
+ * <p>This is a Murmur3-like 64-bit variant.
+ * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong>
+ * It may be removed in a future release.</p>
+ *
+ * <p>This is a helper method that will produce the same result as:</p>
+ *
+ * <pre>
+ * int seed = 104729;
+ * long hash = hash64(ByteBuffer.allocate(2)
+ * .putShort(data)
+ * .array(), 0, 2, seed);
+ * <pre>
*
- * @param data - input short
- * @return 64 bit hash
+ * @param data The short to hash
+ * @return The 64-bit hash
+ * @see #hash64(byte[], int, int, int)
*/
public static long hash64(final short data) {
long hash = DEFAULT_SEED;
@@ -304,38 +449,84 @@ public final class MurmurHash3 {
}
/**
- * Generates 64 bit hash from byte array with the given length, offset and
- * default seed.
+ * Generates 64-bit hash from a byte array with a default seed.
*
- * @param data - input byte array
- * @param offset - offset of data
- * @param length - length of array
- * @return 64 bit hash
+ * <p>This is a Murmur3-like 64-bit variant.
+ * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong>
+ * It may be removed in a future release.</p>
+ *
+ * <p>This is a helper method that will produce the same result as:</p>
+ *
+ * <pre>
+ * int seed = 104729;
+ * long hash = hash64(data, 0, data.length, seed);
+ * <pre>
+ *
+ * @param data The input byte array
+ * @return The 64-bit hash
+ * @see #hash64(byte[], int, int, int)
+ */
+ public static long hash64(final byte[] data) {
+ return hash64(data, 0, data.length, DEFAULT_SEED);
+ }
+
+ /**
+ * Generates 64-bit hash from a byte array with the given offset and length and a default seed.
+ *
+ * <p>This is a Murmur3-like 64-bit variant.
+ * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong>
+ * It may be removed in a future release.</p>
+ *
+ * <p>This is a helper method that will produce the same result as:</p>
+ *
+ * <pre>
+ * int seed = 104729;
+ * long hash = hash64(data, offset, length, seed);
+ * <pre>
+ *
+ * @param data The input byte array
+ * @param offset The offset of data
+ * @param length The length of array
+ * @return The 64-bit hash
+ * @see #hash64(byte[], int, int, int)
*/
public static long hash64(final byte[] data, final int offset, final int length) {
return hash64(data, offset, length, DEFAULT_SEED);
}
/**
- * Generates 64 bit hash from byte array with the given length, offset and seed.
+ * Generates 64-bit hash from a byte array with the given offset, length and seed.
+ *
+ * <p>This is a Murmur3-like 64-bit variant.
+ * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong>
+ * It may be removed in a future release.</p>
+ *
+ * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks
+ * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash
+ * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return
+ * the same value as the first or second 64-bits of the function
+ * {@link #hash128(byte[], int, int, int)}.<p>
*
- * @param data - input byte array
- * @param offset - offset of data
- * @param length - length of array
- * @param seed - seed. (default 0)
- * @return 64 bit hash
+ * <p>Use of this method is not advised. Use the first long returned from
+ * {@link #hash128(byte[], int, int, int)}.<p>
+ *
+ * @param data The input byte array
+ * @param offset The offset of data
+ * @param length The length of array
+ * @param seed The initial seed value
+ * @return The 64-bit hash
*/
public static long hash64(final byte[] data, final int offset, final int length, final int seed) {
+ // ************
+ // Note: This fails to apply masking using 0xffffffffL to the seed.
+ // ************
long hash = seed;
final int nblocks = length >> 3;
// body
for (int i = 0; i < nblocks; i++) {
- final int i8 = i << 3;
- long k = ((long) data[offset + i8] & 0xff) | (((long) data[offset + i8 + 1] & 0xff) << 8)
- | (((long) data[offset + i8 + 2] & 0xff) << 16) | (((long) data[offset + i8 + 3] & 0xff) << 24)
- | (((long) data[offset + i8 + 4] & 0xff) << 32) | (((long) data[offset + i8 + 5] & 0xff) << 40)
- | (((long) data[offset + i8 + 6] & 0xff) << 48) | (((long) data[offset + i8 + 7] & 0xff) << 56);
+ final int index = offset + (i << 3);
+ long k = getLittleEndianLong(data, index);
// mix functions
k *= C1;
@@ -347,22 +538,22 @@ public final class MurmurHash3 {
// tail
long k1 = 0;
- final int tailStart = nblocks << 3;
- switch (length - tailStart) {
+ final int index = offset + (nblocks << 3);
+ switch (offset + length - index) {
case 7:
- k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
+ k1 ^= ((long) data[index + 6] & 0xff) << 48;
case 6:
- k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
+ k1 ^= ((long) data[index + 5] & 0xff) << 40;
case 5:
- k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
+ k1 ^= ((long) data[index + 4] & 0xff) << 32;
case 4:
- k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
+ k1 ^= ((long) data[index + 3] & 0xff) << 24;
case 3:
- k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
+ k1 ^= ((long) data[index + 2] & 0xff) << 16;
case 2:
- k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
+ k1 ^= ((long) data[index + 1] & 0xff) << 8;
case 1:
- k1 ^= ((long) data[offset + tailStart] & 0xff);
+ k1 ^= ((long) data[index] & 0xff);
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
@@ -377,52 +568,76 @@ public final class MurmurHash3 {
}
/**
- * Murmur3 128-bit variant.
+ * Generates 128-bit hash from the byte array with a default seed.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int seed = 104729;
+ * int hash = hash128(data, 0, data.length, seed);
+ * </pre>
*
- * @param data - input byte array
- * @return - 128 bit hash (2 longs)
+ * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
+ * this result as the default seed is positive.<p>
+ *
+ * @param data The input byte array
+ * @return The 128-bit hash (2 longs)
+ * @see #hash128(byte[], int, int, int)
*/
public static long[] hash128(final byte[] data) {
return hash128(data, 0, data.length, DEFAULT_SEED);
}
/**
- * Murmur3 128-bit variant.
+ * Generates 128-bit hash from a string with a default seed.
+ * The string is converted to bytes using the default encoding.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int seed = 104729;
+ * byte[] bytes = data.getBytes();
+ * int hash = hash128(bytes, 0, bytes.length, seed);
+ * </pre>
*
- * @param data - input String
- * @return - 128 bit hash (2 longs)
+ * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
+ * this result as the default seed is positive.<p>
+ *
+ * @param data The input String
+ * @return The 128-bit hash (2 longs)
+ * @see #hash128(byte[], int, int, int)
*/
public static long[] hash128(final String data) {
- final byte[] origin = data.getBytes();
- return hash128(origin, 0, origin.length, DEFAULT_SEED);
+ final byte[] bytes = data.getBytes();
+ return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
}
/**
- * Murmur3 128-bit variant.
+ * Generates 128-bit hash from the byte array with the given offset, length and seed.
+ *
+ * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
+ * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
*
- * @param data - input byte array
- * @param offset - the first element of array
- * @param length - length of array
- * @param seed - seed. (default is 0)
- * @return - 128 bit hash (2 longs)
+ * <p>This implementation contains a sign-extension bug in the seed initialisation.
+ * This manifests if the seed is negative.<p>
+ *
+ * @param data The input byte array
+ * @param offset The first element of array
+ * @param length The length of array
+ * @param seed The initial seed value
+ * @return The 128-bit hash (2 longs)
*/
public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
+ // ************
+ // Note: This fails to apply masking using 0xffffffffL to the seed.
+ // ************
long h1 = seed;
long h2 = seed;
final int nblocks = length >> 4;
// body
for (int i = 0; i < nblocks; i++) {
- final int i16 = i << 4;
- long k1 = ((long) data[offset + i16] & 0xff) | (((long) data[offset + i16 + 1] & 0xff) << 8)
- | (((long) data[offset + i16 + 2] & 0xff) << 16) | (((long) data[offset + i16 + 3] & 0xff) << 24)
- | (((long) data[offset + i16 + 4] & 0xff) << 32) | (((long) data[offset + i16 + 5] & 0xff) << 40)
- | (((long) data[offset + i16 + 6] & 0xff) << 48) | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
- long k2 = ((long) data[offset + i16 + 8] & 0xff) | (((long) data[offset + i16 + 9] & 0xff) << 8)
- | (((long) data[offset + i16 + 10] & 0xff) << 16) | (((long) data[offset + i16 + 11] & 0xff) << 24)
- | (((long) data[offset + i16 + 12] & 0xff) << 32) | (((long) data[offset + i16 + 13] & 0xff) << 40)
- | (((long) data[offset + i16 + 14] & 0xff) << 48) | (((long) data[offset + i16 + 15] & 0xff) << 56);
+ final int index = offset + (i << 4);
+ long k1 = getLittleEndianLong(data, index);
+ long k2 = getLittleEndianLong(data, index + 8);
// mix functions for k1
k1 *= C1;
@@ -446,43 +661,43 @@ public final class MurmurHash3 {
// tail
long k1 = 0;
long k2 = 0;
- final int tailStart = nblocks << 4;
- switch (length - tailStart) {
+ final int index = offset + (nblocks << 4);
+ switch (offset + length - index) {
case 15:
- k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
+ k2 ^= ((long) data[index + 14] & 0xff) << 48;
case 14:
- k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
+ k2 ^= ((long) data[index + 13] & 0xff) << 40;
case 13:
- k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
+ k2 ^= ((long) data[index + 12] & 0xff) << 32;
case 12:
- k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
+ k2 ^= ((long) data[index + 11] & 0xff) << 24;
case 11:
- k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
+ k2 ^= ((long) data[index + 10] & 0xff) << 16;
case 10:
- k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
+ k2 ^= ((long) data[index + 9] & 0xff) << 8;
case 9:
- k2 ^= data[offset + tailStart + 8] & 0xff;
+ k2 ^= data[index + 8] & 0xff;
k2 *= C2;
k2 = Long.rotateLeft(k2, R3);
k2 *= C1;
h2 ^= k2;
case 8:
- k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
+ k1 ^= ((long) data[index + 7] & 0xff) << 56;
case 7:
- k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
+ k1 ^= ((long) data[index + 6] & 0xff) << 48;
case 6:
- k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
+ k1 ^= ((long) data[index + 5] & 0xff) << 40;
case 5:
- k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
+ k1 ^= ((long) data[index + 4] & 0xff) << 32;
case 4:
- k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
+ k1 ^= ((long) data[index + 3] & 0xff) << 24;
case 3:
- k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
+ k1 ^= ((long) data[index + 2] & 0xff) << 16;
case 2:
- k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
+ k1 ^= ((long) data[index + 1] & 0xff) << 8;
case 1:
- k1 ^= data[offset + tailStart] & 0xff;
+ k1 ^= data[index] & 0xff;
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
@@ -505,6 +720,46 @@ public final class MurmurHash3 {
return new long[] { h1, h2 };
}
+
+ /**
+ * Gets the little-endian long from 8 bytes starting at the specified index.
+ *
+ * @param data The data
+ * @param index The index
+ * @return The little-endian long
+ */
+ private static long getLittleEndianLong(final byte[] data, final int index) {
+ return (((long) data[index ] & 0xff) ) |
+ (((long) data[index + 1] & 0xff) << 8) |
+ (((long) data[index + 2] & 0xff) << 16) |
+ (((long) data[index + 3] & 0xff) << 24) |
+ (((long) data[index + 4] & 0xff) << 32) |
+ (((long) data[index + 5] & 0xff) << 40) |
+ (((long) data[index + 6] & 0xff) << 48) |
+ (((long) data[index + 7] & 0xff) << 56);
+ }
+
+ /**
+ * Gets the little-endian int from 4 bytes starting at the specified index.
+ *
+ * @param data The data
+ * @param index The index
+ * @return The little-endian int
+ */
+ private static int getLittleEndianInt(final byte[] data, final int index) {
+ return ((data[index ] & 0xff) ) |
+ ((data[index + 1] & 0xff) << 8) |
+ ((data[index + 2] & 0xff) << 16) |
+ ((data[index + 3] & 0xff) << 24);
+ }
+
+ /**
+ * Perform the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
+ *
+ * @param k The data to add to the hash
+ * @param hash The current hash
+ * @return The new hash
+ */
private static int mix32(int k, int hash) {
k *= C1_32;
k = Integer.rotateLeft(k, R1_32);
@@ -513,104 +768,172 @@ public final class MurmurHash3 {
return Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
}
- private static int fmix32(final int length, int hash) {
- hash ^= length;
+ /**
+ * Perform the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
+ *
+ * @param hash The current hash
+ * @return The final hash
+ */
+ private static int fmix32(int hash) {
hash ^= (hash >>> 16);
hash *= 0x85ebca6b;
hash ^= (hash >>> 13);
hash *= 0xc2b2ae35;
hash ^= (hash >>> 16);
-
return hash;
}
- private static long fmix64(long h) {
- h ^= (h >>> 33);
- h *= 0xff51afd7ed558ccdL;
- h ^= (h >>> 33);
- h *= 0xc4ceb9fe1a85ec53L;
- h ^= (h >>> 33);
- return h;
+ /**
+ * Perform the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
+ *
+ * @param hash The current hash
+ * @return The final hash
+ */
+ private static long fmix64(long hash) {
+ hash ^= (hash >>> 33);
+ hash *= 0xff51afd7ed558ccdL;
+ hash ^= (hash >>> 33);
+ hash *= 0xc4ceb9fe1a85ec53L;
+ hash ^= (hash >>> 33);
+ return hash;
}
+ /**
+ * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
+ * hash computed.
+ *
+ * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
+ * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+ *
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ */
public static class IncrementalHash32 {
- byte[] tail = new byte[3];
- int tailLen;
- int totalLen;
- int hash;
-
- public final void start(final int hash) {
- tailLen = totalLen = 0;
- this.hash = hash;
+ /** The size of byte blocks that are processed together. */
+ private static final int BLOCK_SIZE = 4;
+
+ /** Up to 3 unprocessed bytes from input data. */
+ private final byte[] unprocessed = new byte[3];
+ /** The number of unprocessed bytes in the tail data. */
+ private int unprocessedLength;
+ /** The total number of input bytes added since the start. */
+ private int totalLen;
+ /**
+ * The current running hash.
+ * This must be finalised to generate the 32-bit hash value.
+ */
+ private int hash;
+
+ /**
+ * Start a new incremental hash.
+ *
+ * @param seed The initial seed value
+ */
+ public final void start(final int seed) {
+ // Reset
+ unprocessedLength = totalLen = 0;
+ this.hash = seed;
}
- public final void add(final byte[] data, int offset, final int length) {
+ /**
+ * Adds the byte array to the current incremental hash.
+ *
+ * @param data The input byte array
+ * @param offset The offset of data
+ * @param length The length of array
+ */
+ public final void add(final byte[] data, final int offset, final int length) {
if (length == 0) {
+ // Nothing to add
return;
}
totalLen += length;
- if (tailLen + length < 4) {
- System.arraycopy(data, offset, tail, tailLen, length);
- tailLen += length;
+
+ // Process the bytes in blocks of 4.
+ // New bytes must be added to any current unprocessed bytes,
+ // then processed in blocks of 4 and the remaining bytes saved:
+ //
+ // |--|---------------------------|--|
+ // unprocessed
+ // main block
+ // remaining
+
+ // Check if the unprocessed bytes and new bytes can fill a block of 4
+ if (unprocessedLength + length < BLOCK_SIZE) {
+ // Not enough so add to the unprocessed bytes
+ System.arraycopy(data, offset, unprocessed, unprocessedLength, length);
+ unprocessedLength += length;
return;
}
- int offset2 = 0;
- if (tailLen > 0) {
- offset2 = (4 - tailLen);
+
+ // Combine unprocessed bytes with new bytes.
+ int newOffset;
+ int newLength;
+ if (unprocessedLength > 0) {
int k = -1;
- switch (tailLen) {
+ switch (unprocessedLength) {
case 1:
- k = orBytes(tail[0], data[offset], data[offset + 1], data[offset + 2]);
+ k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]);
break;
case 2:
- k = orBytes(tail[0], tail[1], data[offset], data[offset + 1]);
+ k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]);
break;
case 3:
- k = orBytes(tail[0], tail[1], tail[2], data[offset]);
+ k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]);
break;
default:
- throw new AssertionError(tailLen);
+ throw new AssertionError("Unprocessed length should be 1, 2, or 3: " + unprocessedLength);
}
- // mix functions
- k *= C1_32;
- k = Integer.rotateLeft(k, R1_32);
- k *= C2_32;
- hash ^= k;
- hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+ hash = mix32(k, hash);
+ // Update the offset and length
+ final int consumed = BLOCK_SIZE - unprocessedLength;
+ newOffset = offset + consumed;
+ newLength = length - consumed;
+ } else {
+ newOffset = offset;
+ newLength = length;
}
- final int length2 = length - offset2;
- offset += offset2;
- final int nblocks = length2 >> 2;
- for (int i = 0; i < nblocks; i++) {
- final int i_4 = (i << 2) + offset;
- int k = orBytes(data[i_4], data[i_4 + 1], data[i_4 + 2], data[i_4 + 3]);
+ // Main processing of blocks of 4 bytes
+ final int nblocks = newLength >> 2;
- // mix functions
- k *= C1_32;
- k = Integer.rotateLeft(k, R1_32);
- k *= C2_32;
- hash ^= k;
- hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+ for (int i = 0; i < nblocks; i++) {
+ final int index = newOffset + (i << 2);
+ final int k = getLittleEndianInt(data, index);
+ hash = mix32(k, hash);
}
+ // Save left-over unprocessed bytes
final int consumed = (nblocks << 2);
- tailLen = length2 - consumed;
- if (consumed == length2) {
- return;
+ unprocessedLength = newLength - consumed;
+ if (unprocessedLength != 0) {
+ System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength);
}
- System.arraycopy(data, offset + consumed, tail, 0, tailLen);
}
+ /**
+ * Generate the 32-bit hash value. Repeat calls to this method with no additional data
+ * will generate the same hash value.
+ *
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @return The 32-bit hash
+ */
public final int end() {
+ // ************
+ // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
+ // ************
int k1 = 0;
- switch (tailLen) {
+ switch (unprocessedLength) {
case 3:
- k1 ^= tail[2] << 16;
+ k1 ^= unprocessed[2] << 16;
case 2:
- k1 ^= tail[1] << 8;
+ k1 ^= unprocessed[1] << 8;
case 1:
- k1 ^= tail[0];
+ k1 ^= unprocessed[0];
// mix functions
k1 *= C1_32;
@@ -621,16 +944,22 @@ public final class MurmurHash3 {
// finalization
hash ^= totalLen;
- hash ^= (hash >>> 16);
- hash *= 0x85ebca6b;
- hash ^= (hash >>> 13);
- hash *= 0xc2b2ae35;
- hash ^= (hash >>> 16);
- return hash;
+ return fmix32(hash);
}
- }
- private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
- return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
+ /**
+ * Combine the bytes using an Or operation ({@code | } in a little-endian representation
+ * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most
+ * significant.
+ *
+ * @param b1 The first byte
+ * @param b2 The second byte
+ * @param b3 The third byte
+ * @param b4 The fourth byte
+ * @return The 32-bit integer
+ */
+ private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
+ return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
+ }
}
}