You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/11/21 15:54:41 UTC
[commons-codec] 04/07: [CODEC-267] Add hash32x86 methods to fix
sign extension error.
This is an automated email from the ASF dual-hosted git repository.
aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git
commit 1645ab953ce98bffaff2424cd0b3d2501e21b04f
Author: aherbert <ah...@apache.org>
AuthorDate: Thu Nov 21 15:13:42 2019 +0000
[CODEC-267] Add hash32x86 methods to fix sign extension error.
Adds a new API to preserve behavioural compatibility with the released
version:
hash32x86(byte[])
hash32x86(byte[], int offset, int length, int seed)
IncrementalHash32x64
The methods with the sign extension bug have been deprecated.
The new API methods use zero as the default seed.
mvn clirr:check is OK with adding a new class IncrementalHash32x64 as
the parent of IncrementalHash32.
---
src/changes/changes.xml | 3 +-
.../apache/commons/codec/digest/MurmurHash3.java | 159 +++++++++++++++++++--
.../commons/codec/digest/MurmurHash3Test.java | 122 +++++++++++++++-
3 files changed, 267 insertions(+), 17 deletions(-)
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 1f3a2d1..74fed85 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -43,7 +43,8 @@ The <action> type attribute can be add,update,fix,remove.
<body>
<release version="1.14" date="TBD" description="Feature and fix release.">
- <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to IncrementalHash32.end() to generate the same value.</action>
+ <action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action>
+ <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action>
</release>
<release version="1.13" date="2019-07-20" description="Feature and fix release.">
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index c7a149e..8b28b64 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -217,7 +217,9 @@ public final class MurmurHash3 {
* @param data The input byte array
* @return The 32-bit hash
* @see #hash32(byte[], int, int, int)
+ * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
*/
+ @Deprecated
public static int hash32(final byte[] data) {
return hash32(data, 0, data.length, DEFAULT_SEED);
}
@@ -240,7 +242,9 @@ public final class MurmurHash3 {
* @param data The input string
* @return The 32-bit hash
* @see #hash32(byte[], int, int, int)
+ * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
*/
+ @Deprecated
public static int hash32(final String data) {
final byte[] bytes = data.getBytes();
return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
@@ -263,7 +267,9 @@ public final class MurmurHash3 {
* @param length The length of array
* @return The 32-bit hash
* @see #hash32(byte[], int, int, int)
+ * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
*/
+ @Deprecated
public static int hash32(final byte[] data, final int length) {
return hash32(data, length, DEFAULT_SEED);
}
@@ -285,7 +291,9 @@ public final class MurmurHash3 {
* @param seed The initial seed value
* @return The 32-bit hash
* @see #hash32(byte[], int, int, int)
+ * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
*/
+ @Deprecated
public static int hash32(final byte[] data, final int length, final int seed) {
return hash32(data, 0, length, seed);
}
@@ -305,7 +313,9 @@ public final class MurmurHash3 {
* @param length The length of array
* @param seed The initial seed value
* @return The 32-bit hash
+ * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
*/
+ @Deprecated
public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
int hash = seed;
final int nblocks = length >> 2;
@@ -343,6 +353,69 @@ public final class MurmurHash3 {
}
/**
+ * Generates 32-bit hash from the byte array with a seed of zero.
+ * This is a helper method that will produce the same result as:
+ *
+ * <pre>
+ * int offset = 0;
+ * int seed = 0;
+ * int hash = hash32x86(data, offset, data.length, seed);
+ * </pre>
+ *
+ * @param data The input byte array
+ * @return The 32-bit hash
+ * @see #hash32x86(byte[], int, int, int)
+ */
+ public static int hash32x86(final byte[] data) {
+ return hash32x86(data, 0, data.length, 0);
+ }
+
+ /**
+ * Generates 32-bit hash from the byte array with the given offset, length and seed.
+ *
+ * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
+ * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+ *
+ * @param data The input byte array
+ * @param offset The offset of data
+ * @param length The length of array
+ * @param seed The initial seed value
+ * @return The 32-bit hash
+ */
+ public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
+ int hash = seed;
+ final int nblocks = length >> 2;
+
+ // body
+ for (int i = 0; i < nblocks; i++) {
+ final int index = offset + (i << 2);
+ final int k = getLittleEndianInt(data, index);
+ hash = mix32(k, hash);
+ }
+
+ // tail
+ final int index = offset + (nblocks << 2);
+ int k1 = 0;
+ switch (offset + length - index) {
+ case 3:
+ k1 ^= (data[index + 2] & 0xff) << 16;
+ case 2:
+ k1 ^= (data[index + 1] & 0xff) << 8;
+ case 1:
+ k1 ^= (data[index] & 0xff);
+
+ // mix functions
+ k1 *= C1_32;
+ k1 = Integer.rotateLeft(k1, R1_32);
+ k1 *= C2_32;
+ hash ^= k1;
+ }
+
+ hash ^= length;
+ return fmix32(hash);
+ }
+
+ /**
* Generates 64-bit hash from a long with a default seed.
*
* <p>This is a Murmur3-like 64-bit variant.
@@ -804,12 +877,8 @@ public final class MurmurHash3 {
*
* <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
* from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
- *
- * <p>This implementation contains a sign-extension bug in the finalisation step of
- * any bytes left over from dividing the length by 4. This manifests if any of these
- * bytes are negative.<p>
*/
- public static class IncrementalHash32 {
+ public static class IncrementalHash32x86 {
/** The size of byte blocks that are processed together. */
private static final int BLOCK_SIZE = 4;
@@ -916,26 +985,33 @@ public final class MurmurHash3 {
* Generate the 32-bit hash value. Repeat calls to this method with no additional data
* will generate the same hash value.
*
- * <p>This implementation contains a sign-extension bug in the finalisation step of
- * any bytes left over from dividing the length by 4. This manifests if any of these
- * bytes are negative.<p>
- *
* @return The 32-bit hash
*/
public final int end() {
// Allow calling end() again after adding no data to return the same result.
+ return finalise(hash, unprocessedLength, unprocessed, totalLen);
+ }
+
+ /**
+ * Finalise the running hash to the output 32-bit hash by processing remaining bytes
+ * and performing final mixing.
+ *
+ * @param hash The running hash
+ * @param unprocessedLength The number of unprocessed bytes in the tail data.
+ * @param unprocessed Up to 3 unprocessed bytes from input data.
+ * @param totalLen The total number of input bytes added since the start.
+ * @return The 32-bit hash
+ */
+ int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) {
int result = hash;
- // ************
- // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
- // ************
int k1 = 0;
switch (unprocessedLength) {
case 3:
- k1 ^= unprocessed[2] << 16;
+ k1 ^= (unprocessed[2] & 0xff) << 16;
case 2:
- k1 ^= unprocessed[1] << 8;
+ k1 ^= (unprocessed[1] & 0xff) << 8;
case 1:
- k1 ^= unprocessed[0];
+ k1 ^= (unprocessed[0] & 0xff);
// mix functions
k1 *= C1_32;
@@ -964,4 +1040,57 @@ public final class MurmurHash3 {
return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
}
}
+
+ /**
+ * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
+ * hash computed.
+ *
+ * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
+ * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+ *
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
+ */
+ @Deprecated
+ public static class IncrementalHash32 extends IncrementalHash32x86 {
+ /**
+ * {@inheritDoc}
+ *
+ * <p>This implementation contains a sign-extension bug in the finalisation step of
+ * any bytes left over from dividing the length by 4. This manifests if any of these
+ * bytes are negative.<p>
+ *
+ * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
+ */
+ @Override
+ @Deprecated
+ int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) {
+ int result = hash;
+ // ************
+ // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
+ // ************
+ int k1 = 0;
+ switch (unprocessedLength) {
+ case 3:
+ k1 ^= unprocessed[2] << 16;
+ case 2:
+ k1 ^= unprocessed[1] << 8;
+ case 1:
+ k1 ^= unprocessed[0];
+
+ // mix functions
+ k1 *= C1_32;
+ k1 = Integer.rotateLeft(k1, R1_32);
+ k1 *= C2_32;
+ result ^= k1;
+ }
+
+ // finalization
+ result ^= totalLen;
+ return fmix32(result);
+ }
+ }
}
diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
index d7c21aa..f975ca8 100644
--- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
+++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32;
+import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32x86;
import org.junit.Test;
/**
@@ -349,7 +350,7 @@ public class MurmurHash3Test {
* if the final 1, 2, or 3 bytes are negative.
*/
@Test
- public void testHash32With1TrailingSignedByteIsInvalid() {
+ public void testHash32WithTrailingNegativeSignedBytesIsInvalid() {
// import mmh3
// import numpy as np
// mmh3.hash(np.uint8([-1]))
@@ -367,6 +368,71 @@ public class MurmurHash3Test {
}
/**
+ * Test the {@link MurmurHash3#hash32x86(byte[])} algorithm.
+ *
+ * <p>Reference data is taken from the Python library {@code mmh3}.</p>
+ *
+ * @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
+ */
+ @Test
+ public void testhash32x86() {
+ // Note: Default seed is zero.
+
+ // mmh3.hash(bytes, 0)
+ Assert.assertEquals(1546271276, MurmurHash3.hash32x86(RANDOM_BYTES));
+
+ // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
+ // 15 bytes remaining.
+ // for x in range(0, 32):
+ // print(mmh3.hash(bytes[:x], 0), ',')
+ final int[] answers = {0, -1353253853, 915381745, -734983419, 1271125654, -1042265893, -1204521619, 735845843,
+ 138310876, -1918938664, 1399647898, -1126342309, 2067593280, 1220975287, 1941281084, -1289513180, 942412060,
+ -618173583, -269546647, -1645631262, 1162379906, -1960125577, -1856773195, 1980513522, 1174612855,
+ 905810751, 1044578220, -1758486689, -491393913, 839836946, -435014415, 2044851178,};
+ for (int i = 0; i < answers.length; i++) {
+ final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
+ Assert.assertEquals(answers[i], MurmurHash3.hash32x86(bytes));
+ }
+ }
+
+ /**
+ * Test the {@link MurmurHash3#hash32x86(byte[], int, int, int)} algorithm.
+ *
+ * <p>Reference data is taken from the Python library {@code mmh3}.</p>
+ *
+ * @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
+ */
+ @Test
+ public void testHash32x86WithOffsetLengthAndSeed() {
+ // Data as above for testing MurmurHash3.hash32(byte[], int, int, int).
+ final int seed = -42;
+ final int offset = 13;
+ final int[] answers = {192929823, -27171978, -1282326280, -816314453, -1176217753, -1904531247, 1962794233,
+ -1302316624, -1151850323, -1464386748, -369299427, 972232488, 1747314487, 2137398916, 690986564,
+ -1985866226, -678669121, -2123325690, -253319081, 46181235, 656058278, 1401175653, 1750113912, -1567219725,
+ 2032742772, -2024269989, -305340794, 1161737942, -661265418, 172838872, -650122718, -1934812417,};
+ for (int i = 0; i < answers.length; i++) {
+ Assert.assertEquals(answers[i], MurmurHash3.hash32x86(RANDOM_BYTES, offset, i, seed));
+ }
+ }
+
+ /**
+ * Test to demonstrate {@link MurmurHash3#hash32x86(byte[], int, int, int)} is OK
+ * if the final 1, 2, or 3 bytes are negative.
+ */
+ @Test
+ public void testHash32x86WithTrailingNegativeSignedBytes() {
+ // Data as above for testing MurmurHash3.hash32(byte[], int, int, int).
+ // This test uses assertEquals().
+ Assert.assertEquals(-43192051, MurmurHash3.hash32x86(new byte[] {-1}, 0, 1, 0));
+ Assert.assertEquals(-582037868, MurmurHash3.hash32x86(new byte[] {0, -1}, 0, 2, 0));
+ Assert.assertEquals(922088087, MurmurHash3.hash32x86(new byte[] {0, 0, -1}, 0, 3, 0));
+ Assert.assertEquals(-1309567588, MurmurHash3.hash32x86(new byte[] {-1, 0}, 0, 2, 0));
+ Assert.assertEquals(-363779670, MurmurHash3.hash32x86(new byte[] {-1, 0, 0}, 0, 3, 0));
+ Assert.assertEquals(-225068062, MurmurHash3.hash32x86(new byte[] {0, -1, 0}, 0, 3, 0));
+ }
+
+ /**
* Test the {@link MurmurHash3#hash64(byte[])} algorithm.
* Unknown origin of test data. It may be from the Apache Hive project.
*/
@@ -600,4 +666,58 @@ public class MurmurHash3Test {
Assert.assertEquals("Hashes differ after no additional data", h1, inc.end());
}
}
+
+ /**
+ * Test {@link IncrementalHash32x86} returns the same values as
+ * {@link MurmurHash3#hash32x86(byte[], int, int, int)}.
+ */
+ @Test
+ public void testIncrementalHash32x86() {
+ final byte[] bytes = new byte[1023];
+ ThreadLocalRandom.current().nextBytes(bytes);
+ // The seed does not matter
+ for (final int seed : new int[] {-567, 0, 6787990}) {
+ // Cases are constructed to hit all edge cases of processing:
+ // Nothing added
+ assertIncrementalHash32x86(bytes, seed, 0, 0);
+ // Add single bytes
+ assertIncrementalHash32x86(bytes, seed, 1, 1, 1, 1, 1, 1, 1, 1);
+ // Leading unprocessed 1, 2, 3
+ assertIncrementalHash32x86(bytes, seed, 1, 4);
+ assertIncrementalHash32x86(bytes, seed, 2, 4);
+ assertIncrementalHash32x86(bytes, seed, 3, 4);
+ // Trailing unprocessed 1, 2, 3
+ assertIncrementalHash32x86(bytes, seed, 4, 1);
+ assertIncrementalHash32x86(bytes, seed, 4, 2);
+ assertIncrementalHash32x86(bytes, seed, 4, 3);
+ // Complete blocks
+ assertIncrementalHash32x86(bytes, seed, 4, 16, 64);
+ }
+ }
+
+ /**
+ * Assert {@link IncrementalHash32x86} returns the same values as
+ * {@link MurmurHash3#hash32x86(byte[], int, int, int)}.
+ *
+ * <p>The bytes are added to the incremental hash in the given blocks.</p>
+ *
+ * @param bytes the bytes
+ * @param seed the seed
+ * @param blocks the blocks
+ */
+ private static void assertIncrementalHash32x86(byte[] bytes, int seed, int... blocks) {
+ int offset = 0;
+ int total = 0;
+ final IncrementalHash32x86 inc = new IncrementalHash32x86();
+ inc.start(seed);
+ for (final int block : blocks) {
+ total += block;
+ final int h1 = MurmurHash3.hash32x86(bytes, 0, total, seed);
+ inc.add(bytes, offset, block);
+ offset += block;
+ final int h2 = inc.end();
+ Assert.assertEquals("Hashes differ", h1, h2);
+ Assert.assertEquals("Hashes differ after no additional data", h1, inc.end());
+ }
+ }
}