You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/11/21 15:54:42 UTC

[commons-codec] 05/07: [CODEC-264] Add hash128x64 methods to fix sign extension error.

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git

commit 82b108f3df4b8db2a2ce56afce19a2b0a97c5539
Author: aherbert <ah...@apache.org>
AuthorDate: Thu Nov 21 15:32:11 2019 +0000

    [CODEC-264] Add hash128x64 methods to fix sign extension error.
    
    Adds a new API to preserve behavioural compatibility with the released
    version:
    
    hash128x64(byte[])
    hash128x64(byte[], int offset, int length, int seed)
    
    The methods with the sign extension bug have been deprecated.
    
    The new API methods use zero as the default seed.
---
 src/changes/changes.xml                            |   1 +
 .../apache/commons/codec/digest/MurmurHash3.java   |  60 +++++++++++-
 .../commons/codec/digest/MurmurHash3Test.java      | 109 +++++++++++++++++++++
 3 files changed, 167 insertions(+), 3 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 74fed85..4949fcb 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -43,6 +43,7 @@ The <action> type attribute can be add,update,fix,remove.
   <body>
 
     <release version="1.14" date="TBD" description="Feature and fix release.">
+      <action issue="CODEC-264" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash128x64 methods to fix sign extension error during seeding in hash128 methods.</action>
       <action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action>
       <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action>
     </release>
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index 8b28b64..a8ef5f3 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -554,7 +554,7 @@ public final class MurmurHash3 {
      *
      * <pre>
      * int seed = 104729;
-     * long hash = hash64(data, offset, length, seed);
+     * long hash = MurmurHash3.hash64(data, offset, length, seed);
      * <pre>
      *
      * @param data The input byte array
@@ -645,8 +645,9 @@ public final class MurmurHash3 {
      * This is a helper method that will produce the same result as:
      *
      * <pre>
+     * int offset = 0;
      * int seed = 104729;
-     * int hash = hash128(data, 0, data.length, seed);
+     * int hash = MurmurHash3.hash128(data, offset, data.length, seed);
      * </pre>
      *
      * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
@@ -661,6 +662,24 @@ public final class MurmurHash3 {
     }
 
     /**
+     * Generates 128-bit hash from the byte array with a seed of zero.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int offset = 0;
+     * int seed = 0;
+     * int hash = MurmurHash3.hash128x64(data, offset, data.length, 0);
+     * </pre>
+     *
+     * @param data The input byte array
+     * @return The 128-bit hash (2 longs)
+     * @see #hash128x64(byte[], int, int, int)
+     */
+    public static long[] hash128x64(final byte[] data) {
+        return hash128x64(data, 0, data.length, 0);
+    }
+
+    /**
      * Generates 128-bit hash from a string with a default seed.
      * The string is converted to bytes using the default encoding.
      * This is a helper method that will produce the same result as:
@@ -668,7 +687,7 @@ public final class MurmurHash3 {
      * <pre>
      * int seed = 104729;
      * byte[] bytes = data.getBytes();
-     * int hash = hash128(bytes, 0, bytes.length, seed);
+     * int hash = MurmurHash3.hash128(bytes, 0, bytes.length, seed);
      * </pre>
      *
      * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
@@ -697,11 +716,46 @@ public final class MurmurHash3 {
      * @param length The length of array
      * @param seed The initial seed value
      * @return The 128-bit hash (2 longs)
+     * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialisation.
      */
+    @Deprecated
     public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
         // ************
         // Note: This fails to apply masking using 0xffffffffL to the seed.
         // ************
+        return hash128x64(data, offset, length, seed);
+    }
+
+    /**
+     * Generates 128-bit hash from the byte array with the given offset, length and seed.
+     *
+     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+     *
+     * @param data The input byte array
+     * @param offset The first element of array
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 128-bit hash (2 longs)
+     */
+    public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
+        // Use an unsigned 32-bit integer as the seed
+        return hash128x64(data, offset, length, seed & 0xffffffffL);
+    }
+
+    /**
+     * Generates 128-bit hash from the byte array with the given offset, length and seed.
+     *
+     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
+     *
+     * @param data The input byte array
+     * @param offset The first element of array
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 128-bit hash (2 longs)
+     */
+    private static long[] hash128x64(final byte[] data, final int offset, final int length, final long seed) {
         long h1 = seed;
         long h2 = seed;
         final int nblocks = length >> 4;
diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
index f975ca8..79787f3 100644
--- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
+++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
@@ -30,6 +30,7 @@ import org.junit.Test;
 /**
  * Test for {@link MurmurHash3}.
  */
+@SuppressWarnings("deprecation")
 public class MurmurHash3Test {
 
     /** Test data for the hash64 method. */
@@ -614,6 +615,114 @@ public class MurmurHash3Test {
     }
 
     /**
+     * Test the {@link MurmurHash3#hash128x64(byte[])} algorithm.
+     *
+     * <p>Reference data is taken from the Python library {@code mmh3}.</p>
+     *
+     * @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
+     */
+    @Test
+    public void testHash128x64() {
+        // Note: Default seed is zero.
+
+        // mmh3.hash64(bytes, 0)
+        Assert.assertArrayEquals(new long[] {1972113670104592209L, 5171809317673151911L},
+            MurmurHash3.hash128x64(RANDOM_BYTES));
+
+        // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
+        // 15 bytes remaining.
+        // for x in range(0, 32):
+        //   print(mmh3.hash64(bytes[:x], 0), ',')
+        final long[][] answers = {{0L, 0L}, {-2808653841080383123L, -2531784594030660343L},
+            {-1284575471001240306L, -8226941173794461820L}, {1645529003294647142L, 4109127559758330427L},
+            {-4117979116203940765L, -8362902660322042742L}, {2559943399590596158L, 4738005461125350075L},
+            {-1651760031591552651L, -5386079254924224461L}, {-6208043960690815609L, 7862371518025305074L},
+            {-5150023478423646337L, 8346305334874564507L}, {7658274117911906792L, -4962914659382404165L},
+            {1309458104226302269L, 570003296096149119L}, {7440169453173347487L, -3489345781066813740L},
+            {-5698784298612201352L, 3595618450161835420L}, {-3822574792738072442L, 6878153771369862041L},
+            {3705084673301918328L, 3202155281274291907L}, {-6797166743928506931L, -4447271093653551597L},
+            {5240533565589385084L, -5575481185288758327L}, {-8467620131382649428L, -6450630367251114468L},
+            {3632866961828686471L, -5957695976089313500L}, {-6450283648077271139L, -7908632714374518059L},
+            {226350826556351719L, 8225586794606475685L}, {-2382996224496980401L, 2188369078123678011L},
+            {-1337544762358780825L, 7004253486151757299L}, {2889033453638709716L, -4099509333153901374L},
+            {-8644950936809596954L, -5144522919639618331L}, {-5628571865255520773L, -839021001655132087L},
+            {-5226774667293212446L, -505255961194269502L}, {1337107025517938142L, 3260952073019398505L},
+            {9149852874328582511L, 1880188360994521535L}, {-4035957988359881846L, -7709057850766490780L},
+            {-3842593823306330815L, 3805147088291453755L}, {4030161393619149616L, -2813603781312455238L},};
+        for (int i = 0; i < answers.length; i++) {
+            final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
+            Assert.assertArrayEquals(answers[i], MurmurHash3.hash128x64(bytes));
+        }
+    }
+
+    /**
+     * Test the {@link MurmurHash3#hash128x64(byte[], int, int, int)} algorithm.
+     *
+     * <p>Reference data is taken from the Python library {@code mmh3}.</p>
+     *
+     * @see <a href="https://pypi.org/project/mmh3/">mmh3</a>
+     */
+    @Test
+    public void testHash128x64WithOffsetLengthAndSeed() {
+        // Seed can be positive
+        final int seed = 42;
+        final int offset = 13;
+
+        // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
+        // 15 bytes remaining.
+        // for x in range(0, 32):
+        //   print(mmh3.hash64(bytes[13:x+13], 42), ',')
+        final long[][] answers = {{-1140915396076141277L, -3386313222241793095L},
+            {2745805417334040752L, -3045882272665292331L}, {6807939080212835946L, -1975749467247671127L},
+            {-7924884987449335214L, -4468571497642087939L}, {3005389733967167773L, -5809440073240597398L},
+            {8032745196600164727L, 4545709434702374224L}, {2095398623732573832L, 1778447136435513908L},
+            {4492723708121417255L, -7411125500882394867L}, {8467397417110552178L, -1503802302645548949L},
+            {4189760269121918355L, -8004336343217265057L}, {4939298084211301953L, -8419135013628844658L},
+            {5497136916151148085L, -394028342910298191L}, {3405983294878231737L, -3216533807498089078L},
+            {5833223403351466775L, -1792451370239813325L}, {7730583391236194819L, 5356157313842354092L},
+            {3111977482488580945L, -3119414725698132191L}, {3314524606404365027L, -1923219843083192742L},
+            {7299569240140613949L, 4176392429810027494L}, {6398084683727166117L, 7703960505857395788L},
+            {-8594572031068184774L, 4394224719145783692L}, {-7589785442804461713L, 4110439243215224554L},
+            {-5343610105946840628L, -4423992782020122809L}, {-522490326525787270L, 289136460641968781L},
+            {-5320637070354802556L, -7845553044730489027L}, {1344456408744313334L, 3803048032054968586L},
+            {1131205296221907191L, -6256656049039287019L}, {8583339267101027117L, 8934225022848628726L},
+            {-6379552869905441749L, 8973517768420051734L}, {5076646564516328801L, 8561479196844000567L},
+            {-4610341636137642517L, -6694266039505142069L}, {-758896383254029789L, 4050360662271552727L},
+            {-6123628195475753507L, 4283875822581966645L},};
+        for (int i = 0; i < answers.length; i++) {
+            Assert.assertArrayEquals("Length: " + i, answers[i], MurmurHash3.hash128x64(RANDOM_BYTES, offset, i, seed));
+        }
+
+        // Seed can be negative
+        final int seed2 = -42;
+
+        // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
+        // 15 bytes remaining.
+        // for x in range(0, 32):
+        //   print(mmh3.hash64(bytes[13:x+13], -42), ',')
+        final long[][] answers2 = {{7182599573337898253L, -6490979146667806054L},
+            {-461284136738605467L, 7073284964362976233L}, {-3090354666589400212L, 2978755180788824810L},
+            {5052807367580803906L, -4497188744879598335L}, {5003711854877353474L, -6616808651483337088L},
+            {2043501804923817748L, -760668448196918637L}, {6813003268375229932L, -1818545210475363684L},
+            {4488070015393027916L, 8520186429078977003L}, {4709278711722456062L, -2262080641289046033L},
+            {-5944514262756048380L, 5968714500873552518L}, {-2304397529301122510L, 6451500469518446488L},
+            {-1054078041081348909L, -915114408909600132L}, {1300471646869277217L, -399493387666437046L},
+            {-2821780479886030222L, -9061571187511294733L}, {8005764841242557505L, 4135287855434326053L},
+            {318307346637037498L, -5355856739901286522L}, {3380719536119187025L, 1890890833937151467L},
+            {2691044185935730001L, 7963546423617895734L}, {-5277462388534000227L, 3613853764390780573L},
+            {8504421722476165699L, 2058020162708103700L}, {-6578421288092422241L, 3311200163790829579L},
+            {-5915037218487974215L, -7385137075895184179L}, {659642911937398022L, 854071824595671049L},
+            {-7007237968866727198L, 1372258010932080058L}, {622891376282772539L, -4140783491297489868L},
+            {8357110718969014985L, -4737117827581590306L}, {2208857857926305405L, -8360240839768465042L},
+            {858120048221036376L, -5822288789703639119L}, {-1988334009458340679L, 1262479472434068698L},
+            {-8580307083590783934L, 3634449965473715778L}, {6705664584730187559L, 5192304951463791556L},
+            {-6426410954037604142L, -1579122709247558101L},};
+        for (int i = 0; i < answers.length; i++) {
+            Assert.assertArrayEquals("Length: " + i, answers2[i], MurmurHash3.hash128x64(RANDOM_BYTES, offset, i, seed2));
+        }
+    }
+
+    /**
      * Test {@link IncrementalHash32} returns the same values as
      * {@link MurmurHash3#hash32(byte[], int, int, int)}.
      */