You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/11/21 15:54:41 UTC

[commons-codec] 04/07: [CODEC-267] Add hash32x86 methods to fix sign extension error.

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git

commit 1645ab953ce98bffaff2424cd0b3d2501e21b04f
Author: aherbert <ah...@apache.org>
AuthorDate: Thu Nov 21 15:13:42 2019 +0000

    [CODEC-267] Add hash32x86 methods to fix sign extension error.
    
    Adds a new API to preserve behavioural compatibility with the released
    version:
    
    hash32x86(byte[])
    hash32x86(byte[], int offset, int length, int seed)
    IncrementalHash32x64
    
    The methods with the sign extension bug have been deprecated.
    
    The new API methods use zero as the default seed.
    
    mvn clirr:check is OK with adding a new class IncrementalHash32x64 as
    the parent of IncrementalHash32.
---
 src/changes/changes.xml                            |   3 +-
 .../apache/commons/codec/digest/MurmurHash3.java   | 159 +++++++++++++++++++--
 .../commons/codec/digest/MurmurHash3Test.java      | 122 +++++++++++++++-
 3 files changed, 267 insertions(+), 17 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 1f3a2d1..74fed85 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -43,7 +43,8 @@ The <action> type attribute can be add,update,fix,remove.
   <body>
 
     <release version="1.14" date="TBD" description="Feature and fix release.">
-      <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to IncrementalHash32.end() to generate the same value.</action>
+      <action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action>
+      <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action>
     </release>
 
     <release version="1.13" date="2019-07-20" description="Feature and fix release.">
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index c7a149e..8b28b64 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -217,7 +217,9 @@ public final class MurmurHash3 {
      * @param data The input byte array
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int, int)
+     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
      */
+    @Deprecated
     public static int hash32(final byte[] data) {
         return hash32(data, 0, data.length, DEFAULT_SEED);
     }
@@ -240,7 +242,9 @@ public final class MurmurHash3 {
      * @param data The input string
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int, int)
+     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
      */
+    @Deprecated
     public static int hash32(final String data) {
         final byte[] bytes = data.getBytes();
         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
@@ -263,7 +267,9 @@ public final class MurmurHash3 {
      * @param length The length of array
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int, int)
+     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
      */
+    @Deprecated
     public static int hash32(final byte[] data, final int length) {
         return hash32(data, length, DEFAULT_SEED);
     }
@@ -285,7 +291,9 @@ public final class MurmurHash3 {
      * @param seed The initial seed value
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int, int)
+     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
      */
+    @Deprecated
     public static int hash32(final byte[] data, final int length, final int seed) {
         return hash32(data, 0, length, seed);
     }
@@ -305,7 +313,9 @@ public final class MurmurHash3 {
      * @param length The length of array
      * @param seed The initial seed value
      * @return The 32-bit hash
+     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
      */
+    @Deprecated
     public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
         int hash = seed;
         final int nblocks = length >> 2;
@@ -343,6 +353,69 @@ public final class MurmurHash3 {
     }
 
     /**
+     * Generates 32-bit hash from the byte array with a seed of zero.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int offset = 0;
+     * int seed = 0;
+     * int hash = hash32x86(data, offset, data.length, seed);
+     * </pre>
+     *
+     * @param data The input byte array
+     * @return The 32-bit hash
+     * @see #hash32x86(byte[], int, int, int)
+     */
+    public static int hash32x86(final byte[] data) {
+        return hash32x86(data, 0, data.length, 0);
+    }
+
+    /**
+     * Generates 32-bit hash from the byte array with the given offset, length and seed.
+     *
+     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+     *
+     * @param data The input byte array
+     * @param offset The offset of data
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 32-bit hash
+     */
+    public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
+        int hash = seed;
+        final int nblocks = length >> 2;
+
+        // body
+        for (int i = 0; i < nblocks; i++) {
+            final int index = offset + (i << 2);
+            final int k = getLittleEndianInt(data, index);
+            hash = mix32(k, hash);
+        }
+
+        // tail
+        final int index = offset + (nblocks << 2);
+        int k1 = 0;
+        switch (offset + length - index) {
+        case 3:
+            k1 ^= (data[index + 2] & 0xff) << 16;
+        case 2:
+            k1 ^= (data[index + 1] & 0xff) << 8;
+        case 1:
+            k1 ^= (data[index] & 0xff);
+
+            // mix functions
+            k1 *= C1_32;
+            k1 = Integer.rotateLeft(k1, R1_32);
+            k1 *= C2_32;
+            hash ^= k1;
+        }
+
+        hash ^= length;
+        return fmix32(hash);
+    }
+
+    /**
      * Generates 64-bit hash from a long with a default seed.
      *
      * <p>This is a Murmur3-like 64-bit variant.
@@ -804,12 +877,8 @@ public final class MurmurHash3 {
      *
      * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
      * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
-     *
-     * <p>This implementation contains a sign-extension bug in the finalisation step of
-     * any bytes left over from dividing the length by 4. This manifests if any of these
-     * bytes are negative.<p>
      */
-    public static class IncrementalHash32 {
+    public static class IncrementalHash32x86 {
         /** The size of byte blocks that are processed together. */
         private static final int BLOCK_SIZE = 4;
 
@@ -916,26 +985,33 @@ public final class MurmurHash3 {
          * Generate the 32-bit hash value. Repeat calls to this method with no additional data
          * will generate the same hash value.
          *
-         * <p>This implementation contains a sign-extension bug in the finalisation step of
-         * any bytes left over from dividing the length by 4. This manifests if any of these
-         * bytes are negative.<p>
-         *
          * @return The 32-bit hash
          */
         public final int end() {
             // Allow calling end() again after adding no data to return the same result.
+            return finalise(hash, unprocessedLength, unprocessed, totalLen);
+        }
+
+        /**
+         * Finalise the running hash to the output 32-bit hash by processing remaining bytes
+         * and performing final mixing.
+         *
+         * @param hash The running hash
+         * @param unprocessedLength The number of unprocessed bytes in the tail data.
+         * @param unprocessed Up to 3 unprocessed bytes from input data.
+         * @param totalLen The total number of input bytes added since the start.
+         * @return The 32-bit hash
+         */
+        int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) {
             int result = hash;
-            // ************
-            // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
-            // ************
             int k1 = 0;
             switch (unprocessedLength) {
             case 3:
-                k1 ^= unprocessed[2] << 16;
+                k1 ^= (unprocessed[2] & 0xff) << 16;
             case 2:
-                k1 ^= unprocessed[1] << 8;
+                k1 ^= (unprocessed[1] & 0xff) << 8;
             case 1:
-                k1 ^= unprocessed[0];
+                k1 ^= (unprocessed[0] & 0xff);
 
                 // mix functions
                 k1 *= C1_32;
@@ -964,4 +1040,57 @@ public final class MurmurHash3 {
             return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
         }
     }
+
+    /**
+     * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
+     * hash computed.
+     *
+     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+     *
+     * <p>This implementation contains a sign-extension bug in the finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if any of these
+     * bytes are negative.<p>
+     *
+     * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
+     */
+    @Deprecated
+    public static class IncrementalHash32 extends IncrementalHash32x86 {
+        /**
+         * {@inheritDoc}
+         *
+         * <p>This implementation contains a sign-extension bug in the finalisation step of
+         * any bytes left over from dividing the length by 4. This manifests if any of these
+         * bytes are negative.<p>
+         *
+         * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
+         */
+        @Override
+        @Deprecated
+        int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) {
+            int result = hash;
+            // ************
+            // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
+            // ************
+            int k1 = 0;
+            switch (unprocessedLength) {
+            case 3:
+                k1 ^= unprocessed[2] << 16;
+            case 2:
+                k1 ^= unprocessed[1] << 8;
+            case 1:
+                k1 ^= unprocessed[0];
+
+                // mix functions
+                k1 *= C1_32;
+                k1 = Integer.rotateLeft(k1, R1_32);
+                k1 *= C2_32;
+                result ^= k1;
+            }
+
+            // finalization
+            result ^= totalLen;
+            return fmix32(result);
+        }
+    }
 }
diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
index d7c21aa..f975ca8 100644
--- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
+++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
 import java.util.concurrent.ThreadLocalRandom;
 
 import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32;
+import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32x86;
 import org.junit.Test;
 
 /**
@@ -349,7 +350,7 @@ public class MurmurHash3Test {
      * if the final 1, 2, or 3 bytes are negative.
      */
     @Test
-    public void testHash32With1TrailingSignedByteIsInvalid() {
+    public void testHash32WithTrailingNegativeSignedBytesIsInvalid() {
         // import mmh3
         // import numpy as np
         // mmh3.hash(np.uint8([-1]))
@@ -367,6 +368,71 @@ public class MurmurHash3Test {
     }
 
     /**
+     * Test the {@link MurmurHash3#hash32x86(byte[])} algorithm.
+     *
+     * <p>Reference data is taken from the Python library {@code mmh3}.</p>
+     *
+     * @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
+     */
+    @Test
+    public void testhash32x86() {
+        // Note: Default seed is zero.
+
+        // mmh3.hash(bytes, 0)
+        Assert.assertEquals(1546271276, MurmurHash3.hash32x86(RANDOM_BYTES));
+
+        // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
+        // 15 bytes remaining.
+        // for x in range(0, 32):
+        //   print(mmh3.hash(bytes[:x], 0), ',')
+        final int[] answers = {0, -1353253853, 915381745, -734983419, 1271125654, -1042265893, -1204521619, 735845843,
+            138310876, -1918938664, 1399647898, -1126342309, 2067593280, 1220975287, 1941281084, -1289513180, 942412060,
+            -618173583, -269546647, -1645631262, 1162379906, -1960125577, -1856773195, 1980513522, 1174612855,
+            905810751, 1044578220, -1758486689, -491393913, 839836946, -435014415, 2044851178,};
+        for (int i = 0; i < answers.length; i++) {
+            final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
+            Assert.assertEquals(answers[i], MurmurHash3.hash32x86(bytes));
+        }
+    }
+
+    /**
+     * Test the {@link MurmurHash3#hash32x86(byte[], int, int, int)} algorithm.
+     *
+     * <p>Reference data is taken from the Python library {@code mmh3}.</p>
+     *
+     * @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
+     */
+    @Test
+    public void testHash32x86WithOffsetLengthAndSeed() {
+        // Data as above for testing MurmurHash3.hash32(byte[], int, int, int).
+        final int seed = -42;
+        final int offset = 13;
+        final int[] answers = {192929823, -27171978, -1282326280, -816314453, -1176217753, -1904531247, 1962794233,
+            -1302316624, -1151850323, -1464386748, -369299427, 972232488, 1747314487, 2137398916, 690986564,
+            -1985866226, -678669121, -2123325690, -253319081, 46181235, 656058278, 1401175653, 1750113912, -1567219725,
+            2032742772, -2024269989, -305340794, 1161737942, -661265418, 172838872, -650122718, -1934812417,};
+        for (int i = 0; i < answers.length; i++) {
+            Assert.assertEquals(answers[i], MurmurHash3.hash32x86(RANDOM_BYTES, offset, i, seed));
+        }
+    }
+
+    /**
+     * Test to demonstrate {@link MurmurHash3#hash32x86(byte[], int, int, int)} is OK
+     * if the final 1, 2, or 3 bytes are negative.
+     */
+    @Test
+    public void testHash32x86WithTrailingNegativeSignedBytes() {
+        // Data as above for testing MurmurHash3.hash32(byte[], int, int, int).
+        // This test uses assertEquals().
+        Assert.assertEquals(-43192051, MurmurHash3.hash32x86(new byte[] {-1}, 0, 1, 0));
+        Assert.assertEquals(-582037868, MurmurHash3.hash32x86(new byte[] {0, -1}, 0, 2, 0));
+        Assert.assertEquals(922088087, MurmurHash3.hash32x86(new byte[] {0, 0, -1}, 0, 3, 0));
+        Assert.assertEquals(-1309567588, MurmurHash3.hash32x86(new byte[] {-1, 0}, 0, 2, 0));
+        Assert.assertEquals(-363779670, MurmurHash3.hash32x86(new byte[] {-1, 0, 0}, 0, 3, 0));
+        Assert.assertEquals(-225068062, MurmurHash3.hash32x86(new byte[] {0, -1, 0}, 0, 3, 0));
+    }
+
+    /**
      * Test the {@link MurmurHash3#hash64(byte[])} algorithm.
      * Unknown origin of test data. It may be from the Apache Hive project.
      */
@@ -600,4 +666,58 @@ public class MurmurHash3Test {
             Assert.assertEquals("Hashes differ after no additional data", h1, inc.end());
         }
     }
+
+    /**
+     * Test {@link IncrementalHash32x86} returns the same values as
+     * {@link MurmurHash3#hash32x86(byte[], int, int, int)}.
+     */
+    @Test
+    public void testIncrementalHash32x86() {
+        final byte[] bytes = new byte[1023];
+        ThreadLocalRandom.current().nextBytes(bytes);
+        // The seed does not matter
+        for (final int seed : new int[] {-567, 0, 6787990}) {
+            // Cases are constructed to hit all edge cases of processing:
+            // Nothing added
+            assertIncrementalHash32x86(bytes, seed, 0, 0);
+            // Add single bytes
+            assertIncrementalHash32x86(bytes, seed, 1, 1, 1, 1, 1, 1, 1, 1);
+            // Leading unprocessed 1, 2, 3
+            assertIncrementalHash32x86(bytes, seed, 1, 4);
+            assertIncrementalHash32x86(bytes, seed, 2, 4);
+            assertIncrementalHash32x86(bytes, seed, 3, 4);
+            // Trailing unprocessed 1, 2, 3
+            assertIncrementalHash32x86(bytes, seed, 4, 1);
+            assertIncrementalHash32x86(bytes, seed, 4, 2);
+            assertIncrementalHash32x86(bytes, seed, 4, 3);
+            // Complete blocks
+            assertIncrementalHash32x86(bytes, seed, 4, 16, 64);
+        }
+    }
+
+    /**
+     * Assert {@link IncrementalHash32x86} returns the same values as
+     * {@link MurmurHash3#hash32x86(byte[], int, int, int)}.
+     *
+     * <p>The bytes are added to the incremental hash in the given blocks.</p>
+     *
+     * @param bytes the bytes
+     * @param seed the seed
+     * @param blocks the blocks
+     */
+    private static void assertIncrementalHash32x86(byte[] bytes, int seed, int... blocks) {
+        int offset = 0;
+        int total = 0;
+        final IncrementalHash32x86 inc = new IncrementalHash32x86();
+        inc.start(seed);
+        for (final int block : blocks) {
+            total += block;
+            final int h1 = MurmurHash3.hash32x86(bytes, 0, total, seed);
+            inc.add(bytes, offset, block);
+            offset += block;
+            final int h2 = inc.end();
+            Assert.assertEquals("Hashes differ", h1, h2);
+            Assert.assertEquals("Hashes differ after no additional data", h1, inc.end());
+        }
+    }
 }