You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2020/01/18 02:00:00 UTC

[commons-codec] branch master updated: [CODEC-264]: Ensure hash128 maintains the sign extension bug.

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git


The following commit(s) were added to refs/heads/master by this push:
     new f5a61f0  [CODEC-264]: Ensure hash128 maintains the sign extension bug.
f5a61f0 is described below

commit f5a61f0cd029f18666163f414f848ba0e1b39976
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Sat Jan 18 01:59:55 2020 +0000

    [CODEC-264]: Ensure hash128 maintains the sign extension bug.
    
    The hash128(...) method was calling the public hash128x86(...) method
    with an int seed when it should have called the private hash128x86(...)
    with a long seed. Thus it did not have the sign extension error. The
    internal method has been renamed to avoid a name clash with the public
    API. The old hash128() behaviour is now restored as the seed is passed
    to the hash method with implicit long conversion.
---
 src/changes/changes.xml                            |  4 +++
 .../apache/commons/codec/digest/MurmurHash3.java   | 11 +++---
 .../commons/codec/digest/MurmurHash3Test.java      | 39 +++++++++++++++++++++-
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 7c95625..c907953 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -42,6 +42,10 @@ The <action> type attribute can be add,update,fix,remove.
   </properties>
   <body>
 
+    <release version="1.15" date="YYYY-MM-DD" description="Feature and fix release.">
+      <action issue="CODEC-264" dev="aherbert" due-to="Andy Seaborne" type="fix">MurmurHash3: Ensure hash128 maintains the sign extension bug.</action>
+    </release>
+
     <release version="1.14" date="2019-12-30" description="Feature and fix release.">
       <action issue="CODEC-261" dev="aherbert" type="fix">Hex: Allow encoding read-only ByteBuffer.</action>
       <action issue="CODEC-259" dev="aherbert" type="fix">Hex: Only use an available ByteBuffer backing array if the length equals the remaining byte count.</action>
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index d6f45c1..ba73ef6 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -800,9 +800,12 @@ public final class MurmurHash3 {
     @Deprecated
     public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
         // ************
-        // Note: This fails to apply masking using 0xffffffffL to the seed.
+        // Note: This deliberately fails to apply masking using 0xffffffffL to the seed
+        // to maintain behavioural compatibility with the original version.
+        // The implicit conversion to a long will extend a negative sign
+        // bit through the upper 32-bits of the long seed. These should be zero.
         // ************
-        return hash128x64(data, offset, length, seed);
+        return hash128x64Internal(data, offset, length, seed);
     }
 
     /**
@@ -820,7 +823,7 @@ public final class MurmurHash3 {
      */
     public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
         // Use an unsigned 32-bit integer as the seed
-        return hash128x64(data, offset, length, seed & 0xffffffffL);
+        return hash128x64Internal(data, offset, length, seed & 0xffffffffL);
     }
 
     /**
@@ -835,7 +838,7 @@ public final class MurmurHash3 {
      * @param seed The initial seed value
      * @return The 128-bit hash (2 longs)
      */
-    private static long[] hash128x64(final byte[] data, final int offset, final int length, final long seed) {
+    private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) {
         long h1 = seed;
         long h2 = seed;
         final int nblocks = length >> 4;
diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
index 05f78f7..6f31cf7 100644
--- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
+++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
@@ -559,7 +559,7 @@ public class MurmurHash3Test {
             {1237530251176898868L, 6144786892208594932L}, {2347717913548230384L, -7461066668225718223L},
             {-7963311463560798404L, 8435801462986138227L}, {-7493166089060196513L, 8163503673197886404L},
             {6807249306539951962L, -1438886581269648819L}, {6752656991043418179L, 6334147827922066123L},
-            {-4534351735605790331L, -4530801663887858236L}, {-7886946241830957955L, -6261339648449285315L},        };
+            {-4534351735605790331L, -4530801663887858236L}, {-7886946241830957955L, -6261339648449285315L},};
         for (int i = 0; i < answers.length; i++) {
             final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
             Assert.assertArrayEquals(answers[i], MurmurHash3.hash128(bytes));
@@ -606,6 +606,43 @@ public class MurmurHash3Test {
     }
 
     /**
+     * Test the {@link MurmurHash3#hash128(byte[], int, int, int)} algorithm.
+     *
+     * <p>Explicit test for a negative seed. The original implementation has a sign extension error
+     * for negative seeds. This test is here to maintain behavioural compatibility of the
+     * broken deprecated method.
+     */
+    @Test
+    public void testHash128WithOffsetLengthAndNegativeSeed() {
+        // Seed can be negative
+        final int seed = -42;
+        final int offset = 13;
+
+        // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to
+        // 15 bytes remaining.
+        final long[][] answers = {{5954234972212089025L, 3342108296337967352L},
+                {8501094764898402923L, 7873951092908129427L}, {-3334322685492296196L, -2842715922956549478L},
+                {-2177918982459519644L, -1612349961980368636L}, {4172870320608886992L, -4177375712254136503L},
+                {7546965006884307324L, -5222114032564054641L}, {-2885083166621537267L, -2069868899915344482L},
+                {-2397098497873118851L, 4990578036999888806L}, {-706479374719025018L, 7531201699171849870L},
+                {6487943141157228609L, 3576221902299447884L}, {6671331646806999453L, -3428049860825046360L},
+                {-8700221138601237020L, -2748450904559980545L}, {-9028762509863648063L, 6130259301750313402L},
+                {729958512305702590L, -36367317333638988L}, {-3803232241584178983L, -4257744207892929651L},
+                {5734013720237474696L, -760784490666078990L}, {-6097477411153026656L, 625288777006549065L},
+                {1320365359996757504L, -2251971390373072541L}, {5551441703887653022L, -3516892619809375941L},
+                {698875391638415033L, 3456972931370496131L}, {5874956830271303805L, -6074126509360777023L},
+                {-7030758398537734781L, -3174643657101295554L}, {6835789852786226556L, 7245353136839389595L},
+                {-7755767580598793204L, -6680491060945077989L}, {-3099789923710523185L, -2751080516924952518L},
+                {-7772046549951435453L, 5263206145535830491L}, {7458715941971015543L, 5470582752171544854L},
+                {-7753394773760064468L, -2330157750295630617L}, {-5899278942232791979L, 6235686401271389982L},
+                {4881732293467626532L, 2617335658565007304L}, {-5722863941703478257L, -5424475653939430258L},
+                {-3703319768293496315L, -2124426428486426443L},};
+        for (int i = 0; i < answers.length; i++) {
+            Assert.assertArrayEquals("Length: " + i, answers[i], MurmurHash3.hash128(RANDOM_BYTES, offset, i, seed));
+        }
+    }
+
+    /**
      * Test the {@link MurmurHash3#hash128(String)} algorithm. This only tests it can return
      * the same value as {@link MurmurHash3#hash128(byte[], int, int, int)} if the string
      * is converted to bytes using the method {@link String#getBytes()}.


Re: [commons-codec] branch master updated: [CODEC-264]: Ensure hash128 maintains the sign extension bug.

Posted by Gary Gregory <ga...@gmail.com>.
On Sat, Jan 18, 2020 at 7:35 PM Bruno P. Kinoshita
<br...@yahoo.com.br.invalid> wrote:

> I think we should revert the change in a 1.14.1 or 1.15, and include in
> the release notes what happened.
> From what I understood, this would not break 1.14 users that were not
> using the deprecated methods. And would not affect 1.13 users upgrading to
> 1.14.1 or 1.15 (whichever we choose next).
> It would affect only 1.13 users, who upgraded to 1.14 and kept using the
> deprecated methods, and also store the hashes somewhere?
> And thanks for the summary of the change.
>

My main concern is what will be best for Commons Collections' Bloom
filters. That said, we need to document clearly in the Javadoc what users
can expect.

Gary


> Cheers
> Bruno
> Sent from Yahoo Mail on Android

Re: [commons-codec] branch master updated: [CODEC-264]: Ensure hash128 maintains the sign extension bug.

Posted by "Bruno P. Kinoshita" <br...@yahoo.com.br.INVALID>.
I think we should revert the change in a 1.14.1 or 1.15, and include in the release notes what happened.
From what I understood, this would not break 1.14 users that were not using the deprecated methods. And would not affect 1.13 users upgrading to 1.14.1 or 1.15 (whichever we choose next).
It would affect only 1.13 users, who upgraded to 1.14 and kept using the deprecated methods, and also store the hashes somewhere?
And thanks for the summary of the change. 
Cheers
Bruno
Sent from Yahoo Mail on Android

Re: [commons-codec] branch master updated: [CODEC-264]: Ensure hash128 maintains the sign extension bug.

Posted by Alex Herbert <al...@gmail.com>.

> On 18 Jan 2020, at 16:09, Gary Gregory <ga...@gmail.com> wrote:
> 
> Any thoughts of the effect of this on the Commons Collections Bloom filter
> proposal?
> 

It would have no effect. Claude’s Bloom filter code was what spotted the bug in the MurmurHash3 implementations. We fixed hash32 and hash128 by creating new methods. I presume the Bloom filter code is using the new implementations.

The real issue is that hash128 was meant to remain unchanged. However it was pointed out by Andy Seaborne that the old hash128 implementation calls the new hash128x64 implementation. So the old one is no longer broken. The idea was to keep it broken. Unfortunately somewhere in the history of the changes I dropped the cast of the int to a long and the old method called the wrong new version with an int seed not a long seed. I cannot see where this happened in the git history but the is code is like that with the last commits from me so I must have done it. I see the result as:

- codec 1.14 hash128 works differently from codec 1.13 hash128 if you use the methods and pass them a negative seed
- codec 1.14 marks all the hash128 methods as deprecated so there is a warning there that something has changed
- unfortunately the warning states the hash was wrong and advises you to update, where as it actually updates the old hash too so a user who reads the deprecation comment is mislead

This fix restores the hash128 back to the broken version which may be important for someone who started using it since 1.13 and wants to maintain the same hashes. So they should revert back to 1.13 if they wish to do this and wait for 1.15 (or 1.14.1) to get upgrades to codec. But how do we announce this to users?

Do we:

- push out a quick 1.14.1 patch release
- send out an e-mail to the users mailing list advising to revert to 1.13 if hash128 is key to their app
- just leave it and wait for complaints to roll in (if any ever do)

The safest option is a patch release which then goes through the announce channels and this issue is highlighted as a regression.

I wondered about not doing a release based on my uses cases for a hash. This would be to do a quick check to avoid having to do a more intensive check. In this case if the hash is different I just end up having to do a bit more work until all the new hashes trickle though replacing the old ones. This could be a problem if you have millions of hashes. However there could be cases where the hash is a more permanent part of the system and changing its functionality would break the system. In this instance a user would have to track the bug to the change in codec and have to revert to 1.13. 
 
WDYT?

Alex


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
For additional commands, e-mail: dev-help@commons.apache.org


Re: [commons-codec] branch master updated: [CODEC-264]: Ensure hash128 maintains the sign extension bug.

Posted by Gary Gregory <ga...@gmail.com>.
Any thoughts of the effect of this on the Commons Collections Bloom filter
proposal?

Gary

On Fri, Jan 17, 2020 at 9:00 PM <ah...@apache.org> wrote:

> This is an automated email from the ASF dual-hosted git repository.
>
> aherbert pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/commons-codec.git
>
>
> The following commit(s) were added to refs/heads/master by this push:
>      new f5a61f0  [CODEC-264]: Ensure hash128 maintains the sign extension
> bug.
> f5a61f0 is described below
>
> commit f5a61f0cd029f18666163f414f848ba0e1b39976
> Author: Alex Herbert <ah...@apache.org>
> AuthorDate: Sat Jan 18 01:59:55 2020 +0000
>
>     [CODEC-264]: Ensure hash128 maintains the sign extension bug.
>
>     The hash128(...) method was calling the public hash128x86(...) method
>     with an int seed when it should have called the private hash128x86(...)
>     with a long seed. Thus it did not have the sign extension error. The
>     internal method has been renamed to avoid a name clash with the public
>     API. The old hash128() behaviour is now restored as the seed is passed
>     to the hash method with implicit long conversion.
> ---
>  src/changes/changes.xml                            |  4 +++
>  .../apache/commons/codec/digest/MurmurHash3.java   | 11 +++---
>  .../commons/codec/digest/MurmurHash3Test.java      | 39
> +++++++++++++++++++++-
>  3 files changed, 49 insertions(+), 5 deletions(-)
>
> diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> index 7c95625..c907953 100644
> --- a/src/changes/changes.xml
> +++ b/src/changes/changes.xml
> @@ -42,6 +42,10 @@ The <action> type attribute can be
> add,update,fix,remove.
>    </properties>
>    <body>
>
> +    <release version="1.15" date="YYYY-MM-DD" description="Feature and
> fix release.">
> +      <action issue="CODEC-264" dev="aherbert" due-to="Andy Seaborne"
> type="fix">MurmurHash3: Ensure hash128 maintains the sign extension
> bug.</action>
> +    </release>
> +
>      <release version="1.14" date="2019-12-30" description="Feature and
> fix release.">
>        <action issue="CODEC-261" dev="aherbert" type="fix">Hex: Allow
> encoding read-only ByteBuffer.</action>
>        <action issue="CODEC-259" dev="aherbert" type="fix">Hex: Only use
> an available ByteBuffer backing array if the length equals the remaining
> byte count.</action>
> diff --git
> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> index d6f45c1..ba73ef6 100644
> --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> @@ -800,9 +800,12 @@ public final class MurmurHash3 {
>      @Deprecated
>      public static long[] hash128(final byte[] data, final int offset,
> final int length, final int seed) {
>          // ************
> -        // Note: This fails to apply masking using 0xffffffffL to the
> seed.
> +        // Note: This deliberately fails to apply masking using
> 0xffffffffL to the seed
> +        // to maintain behavioural compatibility with the original
> version.
> +        // The implicit conversion to a long will extend a negative sign
> +        // bit through the upper 32-bits of the long seed. These should
> be zero.
>          // ************
> -        return hash128x64(data, offset, length, seed);
> +        return hash128x64Internal(data, offset, length, seed);
>      }
>
>      /**
> @@ -820,7 +823,7 @@ public final class MurmurHash3 {
>       */
>      public static long[] hash128x64(final byte[] data, final int offset,
> final int length, final int seed) {
>          // Use an unsigned 32-bit integer as the seed
> -        return hash128x64(data, offset, length, seed & 0xffffffffL);
> +        return hash128x64Internal(data, offset, length, seed &
> 0xffffffffL);
>      }
>
>      /**
> @@ -835,7 +838,7 @@ public final class MurmurHash3 {
>       * @param seed The initial seed value
>       * @return The 128-bit hash (2 longs)
>       */
> -    private static long[] hash128x64(final byte[] data, final int offset,
> final int length, final long seed) {
> +    private static long[] hash128x64Internal(final byte[] data, final int
> offset, final int length, final long seed) {
>          long h1 = seed;
>          long h2 = seed;
>          final int nblocks = length >> 4;
> diff --git
> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> index 05f78f7..6f31cf7 100644
> --- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> +++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> @@ -559,7 +559,7 @@ public class MurmurHash3Test {
>              {1237530251176898868L, 6144786892208594932L},
> {2347717913548230384L, -7461066668225718223L},
>              {-7963311463560798404L, 8435801462986138227L},
> {-7493166089060196513L, 8163503673197886404L},
>              {6807249306539951962L, -1438886581269648819L},
> {6752656991043418179L, 6334147827922066123L},
> -            {-4534351735605790331L, -4530801663887858236L},
> {-7886946241830957955L, -6261339648449285315L},        };
> +            {-4534351735605790331L, -4530801663887858236L},
> {-7886946241830957955L, -6261339648449285315L},};
>          for (int i = 0; i < answers.length; i++) {
>              final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i);
>              Assert.assertArrayEquals(answers[i],
> MurmurHash3.hash128(bytes));
> @@ -606,6 +606,43 @@ public class MurmurHash3Test {
>      }
>
>      /**
> +     * Test the {@link MurmurHash3#hash128(byte[], int, int, int)}
> algorithm.
> +     *
> +     * <p>Explicit test for a negative seed. The original implementation
> has a sign extension error
> +     * for negative seeds. This test is here to maintain behavioural
> compatibility of the
> +     * broken deprecated method.
> +     */
> +    @Test
> +    public void testHash128WithOffsetLengthAndNegativeSeed() {
> +        // Seed can be negative
> +        final int seed = -42;
> +        final int offset = 13;
> +
> +        // Test with all sizes up to 31 bytes. This ensures a full round
> of 16-bytes plus up to
> +        // 15 bytes remaining.
> +        final long[][] answers = {{5954234972212089025L,
> 3342108296337967352L},
> +                {8501094764898402923L, 7873951092908129427L},
> {-3334322685492296196L, -2842715922956549478L},
> +                {-2177918982459519644L, -1612349961980368636L},
> {4172870320608886992L, -4177375712254136503L},
> +                {7546965006884307324L, -5222114032564054641L},
> {-2885083166621537267L, -2069868899915344482L},
> +                {-2397098497873118851L, 4990578036999888806L},
> {-706479374719025018L, 7531201699171849870L},
> +                {6487943141157228609L, 3576221902299447884L},
> {6671331646806999453L, -3428049860825046360L},
> +                {-8700221138601237020L, -2748450904559980545L},
> {-9028762509863648063L, 6130259301750313402L},
> +                {729958512305702590L, -36367317333638988L},
> {-3803232241584178983L, -4257744207892929651L},
> +                {5734013720237474696L, -760784490666078990L},
> {-6097477411153026656L, 625288777006549065L},
> +                {1320365359996757504L, -2251971390373072541L},
> {5551441703887653022L, -3516892619809375941L},
> +                {698875391638415033L, 3456972931370496131L},
> {5874956830271303805L, -6074126509360777023L},
> +                {-7030758398537734781L, -3174643657101295554L},
> {6835789852786226556L, 7245353136839389595L},
> +                {-7755767580598793204L, -6680491060945077989L},
> {-3099789923710523185L, -2751080516924952518L},
> +                {-7772046549951435453L, 5263206145535830491L},
> {7458715941971015543L, 5470582752171544854L},
> +                {-7753394773760064468L, -2330157750295630617L},
> {-5899278942232791979L, 6235686401271389982L},
> +                {4881732293467626532L, 2617335658565007304L},
> {-5722863941703478257L, -5424475653939430258L},
> +                {-3703319768293496315L, -2124426428486426443L},};
> +        for (int i = 0; i < answers.length; i++) {
> +            Assert.assertArrayEquals("Length: " + i, answers[i],
> MurmurHash3.hash128(RANDOM_BYTES, offset, i, seed));
> +        }
> +    }
> +
> +    /**
>       * Test the {@link MurmurHash3#hash128(String)} algorithm. This only
> tests it can return
>       * the same value as {@link MurmurHash3#hash128(byte[], int, int,
> int)} if the string
>       * is converted to bytes using the method {@link String#getBytes()}.
>
>