You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2019/12/29 21:33:18 UTC

[commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git


The following commit(s) were added to refs/heads/master by this push:
     new f40005a  [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.
f40005a is described below

commit f40005ad5a9c892fa8d216b12029a49062224351
Author: Gary Gregory <ga...@gmail.com>
AuthorDate: Sun Dec 29 16:33:14 2019 -0500

    [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.
---
 src/changes/changes.xml                            |  1 +
 .../apache/commons/codec/digest/MurmurHash2.java   | 22 ++++++++++++++++------
 .../apache/commons/codec/digest/MurmurHash3.java   | 18 ++++++++++++++----
 .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
 4 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 55ba445..9e1c3c4 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -55,6 +55,7 @@ The <action> type attribute can be add,update,fix,remove.
       <action issue="CODEC-273" dev="ggregory" type="add" due-to="Gary Gregory">Add Path APIs to org.apache.commons.codec.digest.DigestUtils similar to File APIs.</action>
       <action issue="CODEC-274" dev="ggregory" type="add" due-to="Gary Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9 and up.</action>
       <action issue="CODEC-275" dev="ggregory" type="add" due-to="Claude Warren">Add missing note in javadoc when sign extension error is present #34.</action>
+      <action issue="CODEC-276" dev="ggregory" type="add" due-to="Gary Gregory">Reliance on default encoding in MurmurHash2 and MurmurHash3.</action>
     </release>
 
     <release version="1.13" date="2019-07-20" description="Feature and fix release.">
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
index 5b0a712..4dfeca9 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
@@ -17,6 +17,9 @@
 
 package org.apache.commons.codec.digest;
 
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
 /**
  * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
  *
@@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
  */
 public final class MurmurHash2 {
 
+    /**
+     * Default Charset used to convert strings into bytes.
+     * 
+     * Consider private; package private for tests only.
+     */
+    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
+
     // Constants for 32-bit variant
     private static final int M32 = 0x5bd1e995;
     private static final int R32 = 24;
@@ -132,7 +142,7 @@ public final class MurmurHash2 {
      *
      * <pre>
      * int seed = 0x9747b28c;
-     * byte[] bytes = data.getBytes();
+     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
      * </pre>
      *
@@ -141,7 +151,7 @@ public final class MurmurHash2 {
      * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text) {
-        final byte[] bytes = text.getBytes();
+        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
         return hash32(bytes, bytes.length);
     }
 
@@ -152,7 +162,7 @@ public final class MurmurHash2 {
      *
      * <pre>
      * int seed = 0x9747b28c;
-     * byte[] bytes = text.substring(from, from + length).getBytes();
+     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
      * </pre>
      *
@@ -243,7 +253,7 @@ public final class MurmurHash2 {
      *
      * <pre>
      * int seed = 0xe17a1465;
-     * byte[] bytes = data.getBytes();
+     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
      * </pre>
      *
@@ -252,7 +262,7 @@ public final class MurmurHash2 {
      * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text) {
-        final byte[] bytes = text.getBytes();
+        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
         return hash64(bytes, bytes.length);
     }
 
@@ -263,7 +273,7 @@ public final class MurmurHash2 {
      *
      * <pre>
      * int seed = 0xe17a1465;
-     * byte[] bytes = text.substring(from, from + length).getBytes();
+     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
      * </pre>
      *
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index cfeb3d1..4509a8f 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -17,6 +17,9 @@
 
 package org.apache.commons.codec.digest;
 
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
 /**
  * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
  *
@@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
 public final class MurmurHash3 {
 
     /**
+     * Default Charset used to convert strings into bytes.
+     * 
+     * Consider private; package private for tests only.
+     */
+    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
+
+    /**
      * A random number to use for a hash code.
      *
      * @deprecated This is not used internally and will be removed in a future release.
@@ -233,7 +243,7 @@ public final class MurmurHash3 {
      * <pre>
      * int offset = 0;
      * int seed = 104729;
-     * byte[] bytes = data.getBytes();
+     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
      * </pre>
      *
@@ -249,7 +259,7 @@ public final class MurmurHash3 {
      */
     @Deprecated
     public static int hash32(final String data) {
-        final byte[] bytes = data.getBytes();
+        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
     }
 
@@ -751,7 +761,7 @@ public final class MurmurHash3 {
      * <pre>
      * int offset = 0;
      * int seed = 104729;
-     * byte[] bytes = data.getBytes();
+     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
      * </pre>
      *
@@ -766,7 +776,7 @@ public final class MurmurHash3 {
      */
     @Deprecated
     public static long[] hash128(final String data) {
-        final byte[] bytes = data.getBytes();
+        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
     }
 
diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
index 4703f9f..61df7f2 100644
--- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
+++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
@@ -355,7 +355,7 @@ public class MurmurHash3Test {
                 pos += Character.toChars(codePoint, chars, pos);
             }
             final String text = String.copyValueOf(chars, 0, pos);
-            final byte[] bytes = text.getBytes();
+            final byte[] bytes = text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
             final int h1 = MurmurHash3.hash32(bytes, 0, bytes.length, seed);
             final int h2 = MurmurHash3.hash32(text);
             Assert.assertEquals(h1, h2);
@@ -455,7 +455,7 @@ public class MurmurHash3Test {
      */
     @Test
     public void testHash64() {
-        final byte[] origin = TEST_HASH64.getBytes();
+        final byte[] origin = TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
         final long hash = MurmurHash3.hash64(origin);
         Assert.assertEquals(5785358552565094607L, hash);
     }
@@ -466,7 +466,7 @@ public class MurmurHash3Test {
      */
     @Test
     public void testHash64WithOffsetAndLength() {
-        final byte[] origin = TEST_HASH64.getBytes();
+        final byte[] origin = TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
         final byte[] originOffset = new byte[origin.length + 150];
         Arrays.fill(originOffset, (byte) 123);
         System.arraycopy(origin, 0, originOffset, 150, origin.length);
@@ -627,7 +627,7 @@ public class MurmurHash3Test {
                 pos += Character.toChars(codePoint, chars, pos);
             }
             final String text = String.copyValueOf(chars, 0, pos);
-            final byte[] bytes = text.getBytes();
+            final byte[] bytes = text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
             final long[] h1 = MurmurHash3.hash128(bytes, 0, bytes.length, seed);
             final long[] h2 = MurmurHash3.hash128(text);
             Assert.assertArrayEquals(h1, h2);


Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Alex Herbert <al...@gmail.com>.

> On 30 Dec 2019, at 21:26, Alex Herbert <al...@gmail.com> wrote:
> 
> 
> 
>> On 30 Dec 2019, at 14:58, Gary Gregory <garydgregory@gmail.com <ma...@gmail.com>> wrote:
>> 
>> On Mon, Dec 30, 2019 at 9:38 AM Alex Herbert <alex.d.herbert@gmail.com <ma...@gmail.com>>
>> wrote:
>> 
>>> On Mon, 30 Dec 2019, 14:19 Gary Gregory, <garydgregory@gmail.com <ma...@gmail.com>> wrote:
>>> 
>>>> On Mon, Dec 30, 2019 at 9:10 AM Alex Herbert <alex.d.herbert@gmail.com <ma...@gmail.com>>
>>>> wrote:
>>>> 
>>>>> On my second review I fixed the potential overflow in the incremental
>>>> hash
>>>>> for murmur3.
>>>> 
>>>> 
>>>> I did note that fix flying by in a commit but I would have loved to see
>>> it
>>>> accompanied by a // comment. Since I am not sure that fix came with a
>>> test,
>>>> the fix could likely be lost in a future edit by someone wanting to
>>> improve
>>>> the readability of the code, YMMV on "readability ;-)
>>>> 
>>>> Please do feel free to check for other edge case like this one.
>>>> 
>>> 
>>> I'll add a better comment later. Sorry.
>>> 
>>>> 
>>> I did not add a test as Travis may not run it. It would require a 2GB array
>>> to pass to the incremental. On the BaseNCodecTest I had to test for
>>> available memory to run large memory allocation tests using assume to skip
>>> it if there was not enough memory.
>>> 
>>> I'll try a test later and see if it works on Travis. The 3gb limit may be
>>> enough to run this one. It will be interesting to see how long it takes to
>>> hash a 2gb array.
>>> 
>>> The base 64 encoding of a 1gb array is a fair chunk of the test suite time
>>> if it runs. It requires about 5gb of memory to run.
>>> 
>>> I have memory and speed improvements for BaseNCodec but not enough time to
>>> finish them for this release. That can wait for next year.
>>> 
>> 
>> That all sounds good. We can do a 1.14.1 release in January for performance
>> improvements.
>> 
> 
> I added a test for overflow in MurmurHash3Test. Using the original code this test throws. With the overflow safe length check it is OK. This is all locally with a lot of RAM to run the test.
> 
> Travis did not run the test. It gets a ’Skipped 1’ in JDK 8, 11 and 13. I did not check the others. I presume this is due to memory requirements.
> 
> I’ve had a scan through the code and this same overflow during incremental update is a problem in:
> 
> XXHash32
> 
> Other classes implementing Checksum (PureJavaCrc32, PureJavaCrc32C) use a different method to update from a byte array of a given length.
> 
> The overflow in XXHash32 can be solved with the same fix used in MurmurHash3 incremental hash. There is also an overflow issue in the getValue() method of XXHash32:
> 
>         if (totalLen > BUF_SIZE) {
> 
> Here totalLen is being used to store the number of bytes processed. All this check requires is to know that the state array has been updated since the reset with the original seed. If totalLen overflows to negative then the hash will be incorrect as it will not use the entire state array.
> 
> To really check this would require a XXHash32 implementation capable of hashing >2^31 bytes to get a reference answer. However a simple change to use a boolean flag to hold the state updated condition would fix the bug and pass all the current tests.
> 
> WDYT?

Since this was a simple fix I made the changes to XXHash32. I added tests so the class now has 100% coverage. Previously edge cases were not being hit.

The test using a slightly smaller large array than MurmurHash3Test also fails to run on travis. I expect these will be run when verifying a RC as a local build will probably have more RAM available. This will prevent regression.

I had to download from xxHash the reference implementation. There is a xxHash64 version available. This may be of interest in a future codec release.

> 
> 
>> Gary
>> 
>> 
>>> 
>>> 
>>> 
>>>> Gary
>>>> 
>>>> 
>>>>> Maybe this type of code is present elsewhere in codec for
>>>>> other incremental algorithms. Basically you have to guard against
>>> someone
>>>>> incrementally adding max length arrays which will overflow a small
>>> count
>>>> of
>>>>> unprocessed bytes held as an integer.
>>>>> 
>>>>> I'm not at the computer now so can't check for other incrementals. I
>>> can
>>>> do
>>>>> this later if you want or you could have a look. It's not a major
>>> thing.
>>>>> You would have to want to break the code to find this.
>>>>> 
>>>>> Alex
>>>>> 
>>>>> 
>>>>> 
>>>>> 
>>>>> On Mon, 30 Dec 2019, 14:02 Gary Gregory, <garydgregory@gmail.com <ma...@gmail.com>>
>>> wrote:
>>>>> 
>>>>>> On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <
>>> alex.d.herbert@gmail.com <ma...@gmail.com>
>>>>> 
>>>>>> wrote:
>>>>>> 
>>>>>>> OK.
>>>>>>> 
>>>>>>> If you are going to make the behaviour change for the string
>>> methods
>>>>> then
>>>>>>> the javadoc should be updated. It currently states:
>>>>>>> 
>>>>>>> "The string is converted to bytes using the default encoding.”
>>>>>>> 
>>>>>>> So perhaps switch this to:
>>>>>>> 
>>>>>>> “Before 1.14 the string was converted using default encoding. Since
>>>>> 1.14
>>>>>>> the string is converted to bytes using UTF-8 encoding.”
>>>>>>> 
>>>>>>> The MurmurHash3 methods should still be deprecated as they call the
>>>>>>> hash32/hash128 functions that have sign bugs. I deliberately did
>>> not
>>>>> add
>>>>>> a
>>>>>>> new method accepting a String for the fixed murmur3 algorithm. IMO
>>> it
>>>>> is
>>>>>>> unwarranted API bloat.
>>>>>>> 
>>>>>>> MurmurHash2 is not deprecated. In that case the String methods are
>>>> just
>>>>>>> API bloat but the javadoc should be updated.
>>>>>>> 
>>>>>>> Alex
>>>>>>> 
>>>>>> 
>>>>>> Thank you Alex, please review Javadoc changes in git master.
>>>>>> 
>>>>>> Also, if there is anything else that can be improved in code,
>>> Javadoc,
>>>> or
>>>>>> test coverage, please let me know so I can delay creating the release
>>>>>> candidate, which will likely happen tonight (US EST) time permitting.
>>>>>> 
>>>>>> Gary
>>>>>> 
>>>>>> 
>>>>>>> 
>>>>>>>> On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
>>>>>>>> 
>>>>>>>> This is an automated email from the ASF dual-hosted git
>>> repository.
>>>>>>>> 
>>>>>>>> ggregory pushed a commit to branch master
>>>>>>>> in repository
>>>> https://gitbox.apache.org/repos/asf/commons-codec.git
>>>>>>>> 
>>>>>>>> 
>>>>>>>> The following commit(s) were added to refs/heads/master by this
>>>> push:
>>>>>>>>    new f40005a  [CODEC-276] Reliance on default encoding in
>>>>>> MurmurHash2
>>>>>>> and MurmurHash3.
>>>>>>>> f40005a is described below
>>>>>>>> 
>>>>>>>> commit f40005ad5a9c892fa8d216b12029a49062224351
>>>>>>>> Author: Gary Gregory <ga...@gmail.com>
>>>>>>>> AuthorDate: Sun Dec 29 16:33:14 2019 -0500
>>>>>>>> 
>>>>>>>>   [CODEC-276] Reliance on default encoding in MurmurHash2 and
>>>>>>> MurmurHash3.
>>>>>>>> ---
>>>>>>>> src/changes/changes.xml                            |  1 +
>>>>>>>> .../apache/commons/codec/digest/MurmurHash2.java   | 22
>>>>>>> ++++++++++++++++------
>>>>>>>> .../apache/commons/codec/digest/MurmurHash3.java   | 18
>>>>>>> ++++++++++++++----
>>>>>>>> .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
>>>>>>>> 4 files changed, 35 insertions(+), 14 deletions(-)
>>>>>>>> 
>>>>>>>> diff --git a/src/changes/changes.xml b/src/changes/changes.xml
>>>>>>>> index 55ba445..9e1c3c4 100644
>>>>>>>> --- a/src/changes/changes.xml
>>>>>>>> +++ b/src/changes/changes.xml
>>>>>>>> @@ -55,6 +55,7 @@ The <action> type attribute can be
>>>>>>> add,update,fix,remove.
>>>>>>>>      <action issue="CODEC-273" dev="ggregory" type="add"
>>>>> due-to="Gary
>>>>>>> Gregory">Add Path APIs to
>>> org.apache.commons.codec.digest.DigestUtils
>>>>>>> similar to File APIs.</action>
>>>>>>>>      <action issue="CODEC-274" dev="ggregory" type="add"
>>>>> due-to="Gary
>>>>>>> Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9
>>>> and
>>>>>>> up.</action>
>>>>>>>>      <action issue="CODEC-275" dev="ggregory" type="add"
>>>>>> due-to="Claude
>>>>>>> Warren">Add missing note in javadoc when sign extension error is
>>>>> present
>>>>>>> #34.</action>
>>>>>>>> +      <action issue="CODEC-276" dev="ggregory" type="add"
>>>>> due-to="Gary
>>>>>>> Gregory">Reliance on default encoding in MurmurHash2 and
>>>>>>> MurmurHash3.</action>
>>>>>>>>    </release>
>>>>>>>> 
>>>>>>>>    <release version="1.13" date="2019-07-20"
>>> description="Feature
>>>>> and
>>>>>>> fix release.">
>>>>>>>> diff --git
>>>>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>>> index 5b0a712..4dfeca9 100644
>>>>>>>> ---
>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>>> +++
>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>>> @@ -17,6 +17,9 @@
>>>>>>>> 
>>>>>>>> package org.apache.commons.codec.digest;
>>>>>>>> 
>>>>>>>> +import java.nio.charset.Charset;
>>>>>>>> +import java.nio.charset.StandardCharsets;
>>>>>>>> +
>>>>>>>> /**
>>>>>>>> * Implementation of the MurmurHash2 32-bit and 64-bit hash
>>>>> functions.
>>>>>>>> *
>>>>>>>> @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
>>>>>>>> */
>>>>>>>> public final class MurmurHash2 {
>>>>>>>> 
>>>>>>>> +    /**
>>>>>>>> +     * Default Charset used to convert strings into bytes.
>>>>>>>> +     *
>>>>>>>> +     * Consider private; package private for tests only.
>>>>>>>> +     */
>>>>>>>> +    static final Charset GET_BYTES_CHARSET =
>>>> StandardCharsets.UTF_8;
>>>>>>>> +
>>>>>>>>    // Constants for 32-bit variant
>>>>>>>>    private static final int M32 = 0x5bd1e995;
>>>>>>>>    private static final int R32 = 24;
>>>>>>>> @@ -132,7 +142,7 @@ public final class MurmurHash2 {
>>>>>>>>     *
>>>>>>>>     * <pre>
>>>>>>>>     * int seed = 0x9747b28c;
>>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>>     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
>>>>>>>>     * </pre>
>>>>>>>>     *
>>>>>>>> @@ -141,7 +151,7 @@ public final class MurmurHash2 {
>>>>>>>>     * @see #hash32(byte[], int, int)
>>>>>>>>     */
>>>>>>>>    public static int hash32(final String text) {
>>>>>>>> -        final byte[] bytes = text.getBytes();
>>>>>>>> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
>>>>>>>>        return hash32(bytes, bytes.length);
>>>>>>>>    }
>>>>>>>> 
>>>>>>>> @@ -152,7 +162,7 @@ public final class MurmurHash2 {
>>>>>>>>     *
>>>>>>>>     * <pre>
>>>>>>>>     * int seed = 0x9747b28c;
>>>>>>>> -     * byte[] bytes = text.substring(from, from +
>>>>> length).getBytes();
>>>>>>>> +     * byte[] bytes = text.substring(from, from +
>>>>>>> length).getBytes(StandardCharsets.UTF_8);
>>>>>>>>     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
>>>>>>>>     * </pre>
>>>>>>>>     *
>>>>>>>> @@ -243,7 +253,7 @@ public final class MurmurHash2 {
>>>>>>>>     *
>>>>>>>>     * <pre>
>>>>>>>>     * int seed = 0xe17a1465;
>>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>>     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
>>>>>>>>     * </pre>
>>>>>>>>     *
>>>>>>>> @@ -252,7 +262,7 @@ public final class MurmurHash2 {
>>>>>>>>     * @see #hash64(byte[], int, int)
>>>>>>>>     */
>>>>>>>>    public static long hash64(final String text) {
>>>>>>>> -        final byte[] bytes = text.getBytes();
>>>>>>>> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
>>>>>>>>        return hash64(bytes, bytes.length);
>>>>>>>>    }
>>>>>>>> 
>>>>>>>> @@ -263,7 +273,7 @@ public final class MurmurHash2 {
>>>>>>>>     *
>>>>>>>>     * <pre>
>>>>>>>>     * int seed = 0xe17a1465;
>>>>>>>> -     * byte[] bytes = text.substring(from, from +
>>>>> length).getBytes();
>>>>>>>> +     * byte[] bytes = text.substring(from, from +
>>>>>>> length).getBytes(StandardCharsets.UTF_8);
>>>>>>>>     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
>>>>>>>>     * </pre>
>>>>>>>>     *
>>>>>>>> diff --git
>>>>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>>> index cfeb3d1..4509a8f 100644
>>>>>>>> ---
>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>>> +++
>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>>> @@ -17,6 +17,9 @@
>>>>>>>> 
>>>>>>>> package org.apache.commons.codec.digest;
>>>>>>>> 
>>>>>>>> +import java.nio.charset.Charset;
>>>>>>>> +import java.nio.charset.StandardCharsets;
>>>>>>>> +
>>>>>>>> /**
>>>>>>>> * Implementation of the MurmurHash3 32-bit and 128-bit hash
>>>>> functions.
>>>>>>>> *
>>>>>>>> @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
>>>>>>>> public final class MurmurHash3 {
>>>>>>>> 
>>>>>>>>    /**
>>>>>>>> +     * Default Charset used to convert strings into bytes.
>>>>>>>> +     *
>>>>>>>> +     * Consider private; package private for tests only.
>>>>>>>> +     */
>>>>>>>> +    static final Charset GET_BYTES_CHARSET =
>>>> StandardCharsets.UTF_8;
>>>>>>>> +
>>>>>>>> +    /**
>>>>>>>>     * A random number to use for a hash code.
>>>>>>>>     *
>>>>>>>>     * @deprecated This is not used internally and will be
>>> removed
>>>>> in a
>>>>>>> future release.
>>>>>>>> @@ -233,7 +243,7 @@ public final class MurmurHash3 {
>>>>>>>>     * <pre>
>>>>>>>>     * int offset = 0;
>>>>>>>>     * int seed = 104729;
>>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>>     * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
>>>>>> seed);
>>>>>>>>     * </pre>
>>>>>>>>     *
>>>>>>>> @@ -249,7 +259,7 @@ public final class MurmurHash3 {
>>>>>>>>     */
>>>>>>>>    @Deprecated
>>>>>>>>    public static int hash32(final String data) {
>>>>>>>> -        final byte[] bytes = data.getBytes();
>>>>>>>> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
>>>>>>>>        return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
>>>>>>>>    }
>>>>>>>> 
>>>>>>>> @@ -751,7 +761,7 @@ public final class MurmurHash3 {
>>>>>>>>     * <pre>
>>>>>>>>     * int offset = 0;
>>>>>>>>     * int seed = 104729;
>>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>>     * int hash = MurmurHash3.hash128(bytes, offset,
>>> bytes.length,
>>>>>> seed);
>>>>>>>>     * </pre>
>>>>>>>>     *
>>>>>>>> @@ -766,7 +776,7 @@ public final class MurmurHash3 {
>>>>>>>>     */
>>>>>>>>    @Deprecated
>>>>>>>>    public static long[] hash128(final String data) {
>>>>>>>> -        final byte[] bytes = data.getBytes();
>>>>>>>> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
>>>>>>>>        return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
>>>>>>>>    }
>>>>>>>> 
>>>>>>>> diff --git
>>>>>>> 
>>> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>> 
>>> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>>> index 4703f9f..61df7f2 100644
>>>>>>>> ---
>>>>>> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>>> +++
>>>>>> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>>> @@ -355,7 +355,7 @@ public class MurmurHash3Test {
>>>>>>>>                pos += Character.toChars(codePoint, chars, pos);
>>>>>>>>            }
>>>>>>>>            final String text = String.copyValueOf(chars, 0,
>>> pos);
>>>>>>>> -            final byte[] bytes = text.getBytes();
>>>>>>>> +            final byte[] bytes =
>>>>>>> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>>            final int h1 = MurmurHash3.hash32(bytes, 0,
>>>> bytes.length,
>>>>>>> seed);
>>>>>>>>            final int h2 = MurmurHash3.hash32(text);
>>>>>>>>            Assert.assertEquals(h1, h2);
>>>>>>>> @@ -455,7 +455,7 @@ public class MurmurHash3Test {
>>>>>>>>     */
>>>>>>>>    @Test
>>>>>>>>    public void testHash64() {
>>>>>>>> -        final byte[] origin = TEST_HASH64.getBytes();
>>>>>>>> +        final byte[] origin =
>>>>>>> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>>        final long hash = MurmurHash3.hash64(origin);
>>>>>>>>        Assert.assertEquals(5785358552565094607L, hash);
>>>>>>>>    }
>>>>>>>> @@ -466,7 +466,7 @@ public class MurmurHash3Test {
>>>>>>>>     */
>>>>>>>>    @Test
>>>>>>>>    public void testHash64WithOffsetAndLength() {
>>>>>>>> -        final byte[] origin = TEST_HASH64.getBytes();
>>>>>>>> +        final byte[] origin =
>>>>>>> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>>        final byte[] originOffset = new byte[origin.length +
>>> 150];
>>>>>>>>        Arrays.fill(originOffset, (byte) 123);
>>>>>>>>        System.arraycopy(origin, 0, originOffset, 150,
>>>>> origin.length);
>>>>>>>> @@ -627,7 +627,7 @@ public class MurmurHash3Test {
>>>>>>>>                pos += Character.toChars(codePoint, chars, pos);
>>>>>>>>            }
>>>>>>>>            final String text = String.copyValueOf(chars, 0,
>>> pos);
>>>>>>>> -            final byte[] bytes = text.getBytes();
>>>>>>>> +            final byte[] bytes =
>>>>>>> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>>            final long[] h1 = MurmurHash3.hash128(bytes, 0,
>>>>>>> bytes.length, seed);
>>>>>>>>            final long[] h2 = MurmurHash3.hash128(text);
>>>>>>>>            Assert.assertArrayEquals(h1, h2);
>>>>>>>> 
>>>>>>> 
>>>>>>> 
>>>>>>> 
>>> ---------------------------------------------------------------------
>>>>>>> To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
>>>>>>> For additional commands, e-mail: dev-help@commons.apache.org


Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Gary Gregory <ga...@gmail.com>.
On Mon, Dec 30, 2019 at 4:26 PM Alex Herbert <al...@gmail.com>
wrote:

>
>
> > On 30 Dec 2019, at 14:58, Gary Gregory <ga...@gmail.com> wrote:
> >
> > On Mon, Dec 30, 2019 at 9:38 AM Alex Herbert <alex.d.herbert@gmail.com
> <ma...@gmail.com>>
> > wrote:
> >
> >> On Mon, 30 Dec 2019, 14:19 Gary Gregory, <ga...@gmail.com>
> wrote:
> >>
> >>> On Mon, Dec 30, 2019 at 9:10 AM Alex Herbert <alex.d.herbert@gmail.com
> >
> >>> wrote:
> >>>
> >>>> On my second review I fixed the potential overflow in the incremental
> >>> hash
> >>>> for murmur3.
> >>>
> >>>
> >>> I did note that fix flying by in a commit but I would have loved to see
> >> it
> >>> accompanied by a // comment. Since I am not sure that fix came with a
> >> test,
> >>> the fix could likely be lost in a future edit by someone wanting to
> >> improve
> >>> the readability of the code, YMMV on "readability ;-)
> >>>
> >>> Please do feel free to check for other edge case like this one.
> >>>
> >>
> >> I'll add a better comment later. Sorry.
> >>
> >>>
> >> I did not add a test as Travis may not run it. It would require a 2GB
> array
> >> to pass to the incremental. On the BaseNCodecTest I had to test for
> >> available memory to run large memory allocation tests using assume to
> skip
> >> it if there was not enough memory.
> >>
> >> I'll try a test later and see if it works on Travis. The 3gb limit may
> be
> >> enough to run this one. It will be interesting to see how long it takes
> to
> >> hash a 2gb array.
> >>
> >> The base 64 encoding of a 1gb array is a fair chunk of the test suite
> time
> >> if it runs. It requires about 5gb of memory to run.
> >>
> >> I have memory and speed improvements for BaseNCodec but not enough time
> to
> >> finish them for this release. That can wait for next year.
> >>
> >
> > That all sounds good. We can do a 1.14.1 release in January for
> performance
> > improvements.
> >
>
> I added a test for overflow in MurmurHash3Test. Using the original code
> this test throws. With the overflow safe length check it is OK. This is all
> locally with a lot of RAM to run the test.
>
> Travis did not run the test. It gets a ’Skipped 1’ in JDK 8, 11 and 13. I
> did not check the others. I presume this is due to memory requirements.
>
> I’ve had a scan through the code and this same overflow during incremental
> update is a problem in:
>
> XXHash32
>
> Other classes implementing Checksum (PureJavaCrc32, PureJavaCrc32C) use a
> different method to update from a byte array of a given length.
>
> The overflow in XXHash32 can be solved with the same fix used in
> MurmurHash3 incremental hash. There is also an overflow issue in the
> getValue() method of XXHash32:
>
>         if (totalLen > BUF_SIZE) {
>
> Here totalLen is being used to store the number of bytes processed. All
> this check requires is to know that the state array has been updated since
> the reset with the original seed. If totalLen overflows to negative then
> the hash will be incorrect as it will not use the entire state array.
>
> To really check this would require a XXHash32 implementation capable of
> hashing >2^31 bytes to get a reference answer. However a simple change to
> use a boolean flag to hold the state updated condition would fix the bug
> and pass all the current tests.
>
> WDYT?
>

Thank you for going through the code.
I just cut the RC. We can continue changes/fixes for the next release.

Gary


> > Gary
> >
> >
> >>
> >>
> >>
> >>> Gary
> >>>
> >>>
> >>>> Maybe this type of code is present elsewhere in codec for
> >>>> other incremental algorithms. Basically you have to guard against
> >> someone
> >>>> incrementally adding max length arrays which will overflow a small
> >> count
> >>> of
> >>>> unprocessed bytes held as an integer.
> >>>>
> >>>> I'm not at the computer now so can't check for other incrementals. I
> >> can
> >>> do
> >>>> this later if you want or you could have a look. It's not a major
> >> thing.
> >>>> You would have to want to break the code to find this.
> >>>>
> >>>> Alex
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> On Mon, 30 Dec 2019, 14:02 Gary Gregory, <ga...@gmail.com>
> >> wrote:
> >>>>
> >>>>> On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <
> >> alex.d.herbert@gmail.com
> >>>>
> >>>>> wrote:
> >>>>>
> >>>>>> OK.
> >>>>>>
> >>>>>> If you are going to make the behaviour change for the string
> >> methods
> >>>> then
> >>>>>> the javadoc should be updated. It currently states:
> >>>>>>
> >>>>>> "The string is converted to bytes using the default encoding.”
> >>>>>>
> >>>>>> So perhaps switch this to:
> >>>>>>
> >>>>>> “Before 1.14 the string was converted using default encoding. Since
> >>>> 1.14
> >>>>>> the string is converted to bytes using UTF-8 encoding.”
> >>>>>>
> >>>>>> The MurmurHash3 methods should still be deprecated as they call the
> >>>>>> hash32/hash128 functions that have sign bugs. I deliberately did
> >> not
> >>>> add
> >>>>> a
> >>>>>> new method accepting a String for the fixed murmur3 algorithm. IMO
> >> it
> >>>> is
> >>>>>> unwarranted API bloat.
> >>>>>>
> >>>>>> MurmurHash2 is not deprecated. In that case the String methods are
> >>> just
> >>>>>> API bloat but the javadoc should be updated.
> >>>>>>
> >>>>>> Alex
> >>>>>>
> >>>>>
> >>>>> Thank you Alex, please review Javadoc changes in git master.
> >>>>>
> >>>>> Also, if there is anything else that can be improved in code,
> >> Javadoc,
> >>> or
> >>>>> test coverage, please let me know so I can delay creating the release
> >>>>> candidate, which will likely happen tonight (US EST) time permitting.
> >>>>>
> >>>>> Gary
> >>>>>
> >>>>>
> >>>>>>
> >>>>>>> On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> >>>>>>>
> >>>>>>> This is an automated email from the ASF dual-hosted git
> >> repository.
> >>>>>>>
> >>>>>>> ggregory pushed a commit to branch master
> >>>>>>> in repository
> >>> https://gitbox.apache.org/repos/asf/commons-codec.git
> >>>>>>>
> >>>>>>>
> >>>>>>> The following commit(s) were added to refs/heads/master by this
> >>> push:
> >>>>>>>    new f40005a  [CODEC-276] Reliance on default encoding in
> >>>>> MurmurHash2
> >>>>>> and MurmurHash3.
> >>>>>>> f40005a is described below
> >>>>>>>
> >>>>>>> commit f40005ad5a9c892fa8d216b12029a49062224351
> >>>>>>> Author: Gary Gregory <ga...@gmail.com>
> >>>>>>> AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> >>>>>>>
> >>>>>>>   [CODEC-276] Reliance on default encoding in MurmurHash2 and
> >>>>>> MurmurHash3.
> >>>>>>> ---
> >>>>>>> src/changes/changes.xml                            |  1 +
> >>>>>>> .../apache/commons/codec/digest/MurmurHash2.java   | 22
> >>>>>> ++++++++++++++++------
> >>>>>>> .../apache/commons/codec/digest/MurmurHash3.java   | 18
> >>>>>> ++++++++++++++----
> >>>>>>> .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> >>>>>>> 4 files changed, 35 insertions(+), 14 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> >>>>>>> index 55ba445..9e1c3c4 100644
> >>>>>>> --- a/src/changes/changes.xml
> >>>>>>> +++ b/src/changes/changes.xml
> >>>>>>> @@ -55,6 +55,7 @@ The <action> type attribute can be
> >>>>>> add,update,fix,remove.
> >>>>>>>      <action issue="CODEC-273" dev="ggregory" type="add"
> >>>> due-to="Gary
> >>>>>> Gregory">Add Path APIs to
> >> org.apache.commons.codec.digest.DigestUtils
> >>>>>> similar to File APIs.</action>
> >>>>>>>      <action issue="CODEC-274" dev="ggregory" type="add"
> >>>> due-to="Gary
> >>>>>> Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9
> >>> and
> >>>>>> up.</action>
> >>>>>>>      <action issue="CODEC-275" dev="ggregory" type="add"
> >>>>> due-to="Claude
> >>>>>> Warren">Add missing note in javadoc when sign extension error is
> >>>> present
> >>>>>> #34.</action>
> >>>>>>> +      <action issue="CODEC-276" dev="ggregory" type="add"
> >>>> due-to="Gary
> >>>>>> Gregory">Reliance on default encoding in MurmurHash2 and
> >>>>>> MurmurHash3.</action>
> >>>>>>>    </release>
> >>>>>>>
> >>>>>>>    <release version="1.13" date="2019-07-20"
> >> description="Feature
> >>>> and
> >>>>>> fix release.">
> >>>>>>> diff --git
> >>>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> >>>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> >>>>>>> index 5b0a712..4dfeca9 100644
> >>>>>>> ---
> >>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> >>>>>>> +++
> >>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> >>>>>>> @@ -17,6 +17,9 @@
> >>>>>>>
> >>>>>>> package org.apache.commons.codec.digest;
> >>>>>>>
> >>>>>>> +import java.nio.charset.Charset;
> >>>>>>> +import java.nio.charset.StandardCharsets;
> >>>>>>> +
> >>>>>>> /**
> >>>>>>> * Implementation of the MurmurHash2 32-bit and 64-bit hash
> >>>> functions.
> >>>>>>> *
> >>>>>>> @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
> >>>>>>> */
> >>>>>>> public final class MurmurHash2 {
> >>>>>>>
> >>>>>>> +    /**
> >>>>>>> +     * Default Charset used to convert strings into bytes.
> >>>>>>> +     *
> >>>>>>> +     * Consider private; package private for tests only.
> >>>>>>> +     */
> >>>>>>> +    static final Charset GET_BYTES_CHARSET =
> >>> StandardCharsets.UTF_8;
> >>>>>>> +
> >>>>>>>    // Constants for 32-bit variant
> >>>>>>>    private static final int M32 = 0x5bd1e995;
> >>>>>>>    private static final int R32 = 24;
> >>>>>>> @@ -132,7 +142,7 @@ public final class MurmurHash2 {
> >>>>>>>     *
> >>>>>>>     * <pre>
> >>>>>>>     * int seed = 0x9747b28c;
> >>>>>>> -     * byte[] bytes = data.getBytes();
> >>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >>>>>>>     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> >>>>>>>     * </pre>
> >>>>>>>     *
> >>>>>>> @@ -141,7 +151,7 @@ public final class MurmurHash2 {
> >>>>>>>     * @see #hash32(byte[], int, int)
> >>>>>>>     */
> >>>>>>>    public static int hash32(final String text) {
> >>>>>>> -        final byte[] bytes = text.getBytes();
> >>>>>>> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> >>>>>>>        return hash32(bytes, bytes.length);
> >>>>>>>    }
> >>>>>>>
> >>>>>>> @@ -152,7 +162,7 @@ public final class MurmurHash2 {
> >>>>>>>     *
> >>>>>>>     * <pre>
> >>>>>>>     * int seed = 0x9747b28c;
> >>>>>>> -     * byte[] bytes = text.substring(from, from +
> >>>> length).getBytes();
> >>>>>>> +     * byte[] bytes = text.substring(from, from +
> >>>>>> length).getBytes(StandardCharsets.UTF_8);
> >>>>>>>     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> >>>>>>>     * </pre>
> >>>>>>>     *
> >>>>>>> @@ -243,7 +253,7 @@ public final class MurmurHash2 {
> >>>>>>>     *
> >>>>>>>     * <pre>
> >>>>>>>     * int seed = 0xe17a1465;
> >>>>>>> -     * byte[] bytes = data.getBytes();
> >>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >>>>>>>     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> >>>>>>>     * </pre>
> >>>>>>>     *
> >>>>>>> @@ -252,7 +262,7 @@ public final class MurmurHash2 {
> >>>>>>>     * @see #hash64(byte[], int, int)
> >>>>>>>     */
> >>>>>>>    public static long hash64(final String text) {
> >>>>>>> -        final byte[] bytes = text.getBytes();
> >>>>>>> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> >>>>>>>        return hash64(bytes, bytes.length);
> >>>>>>>    }
> >>>>>>>
> >>>>>>> @@ -263,7 +273,7 @@ public final class MurmurHash2 {
> >>>>>>>     *
> >>>>>>>     * <pre>
> >>>>>>>     * int seed = 0xe17a1465;
> >>>>>>> -     * byte[] bytes = text.substring(from, from +
> >>>> length).getBytes();
> >>>>>>> +     * byte[] bytes = text.substring(from, from +
> >>>>>> length).getBytes(StandardCharsets.UTF_8);
> >>>>>>>     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> >>>>>>>     * </pre>
> >>>>>>>     *
> >>>>>>> diff --git
> >>>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> >>>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> >>>>>>> index cfeb3d1..4509a8f 100644
> >>>>>>> ---
> >>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> >>>>>>> +++
> >>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> >>>>>>> @@ -17,6 +17,9 @@
> >>>>>>>
> >>>>>>> package org.apache.commons.codec.digest;
> >>>>>>>
> >>>>>>> +import java.nio.charset.Charset;
> >>>>>>> +import java.nio.charset.StandardCharsets;
> >>>>>>> +
> >>>>>>> /**
> >>>>>>> * Implementation of the MurmurHash3 32-bit and 128-bit hash
> >>>> functions.
> >>>>>>> *
> >>>>>>> @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> >>>>>>> public final class MurmurHash3 {
> >>>>>>>
> >>>>>>>    /**
> >>>>>>> +     * Default Charset used to convert strings into bytes.
> >>>>>>> +     *
> >>>>>>> +     * Consider private; package private for tests only.
> >>>>>>> +     */
> >>>>>>> +    static final Charset GET_BYTES_CHARSET =
> >>> StandardCharsets.UTF_8;
> >>>>>>> +
> >>>>>>> +    /**
> >>>>>>>     * A random number to use for a hash code.
> >>>>>>>     *
> >>>>>>>     * @deprecated This is not used internally and will be
> >> removed
> >>>> in a
> >>>>>> future release.
> >>>>>>> @@ -233,7 +243,7 @@ public final class MurmurHash3 {
> >>>>>>>     * <pre>
> >>>>>>>     * int offset = 0;
> >>>>>>>     * int seed = 104729;
> >>>>>>> -     * byte[] bytes = data.getBytes();
> >>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >>>>>>>     * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
> >>>>> seed);
> >>>>>>>     * </pre>
> >>>>>>>     *
> >>>>>>> @@ -249,7 +259,7 @@ public final class MurmurHash3 {
> >>>>>>>     */
> >>>>>>>    @Deprecated
> >>>>>>>    public static int hash32(final String data) {
> >>>>>>> -        final byte[] bytes = data.getBytes();
> >>>>>>> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> >>>>>>>        return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
> >>>>>>>    }
> >>>>>>>
> >>>>>>> @@ -751,7 +761,7 @@ public final class MurmurHash3 {
> >>>>>>>     * <pre>
> >>>>>>>     * int offset = 0;
> >>>>>>>     * int seed = 104729;
> >>>>>>> -     * byte[] bytes = data.getBytes();
> >>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >>>>>>>     * int hash = MurmurHash3.hash128(bytes, offset,
> >> bytes.length,
> >>>>> seed);
> >>>>>>>     * </pre>
> >>>>>>>     *
> >>>>>>> @@ -766,7 +776,7 @@ public final class MurmurHash3 {
> >>>>>>>     */
> >>>>>>>    @Deprecated
> >>>>>>>    public static long[] hash128(final String data) {
> >>>>>>> -        final byte[] bytes = data.getBytes();
> >>>>>>> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> >>>>>>>        return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
> >>>>>>>    }
> >>>>>>>
> >>>>>>> diff --git
> >>>>>>
> >> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> >>>>>>
> >> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> >>>>>>> index 4703f9f..61df7f2 100644
> >>>>>>> ---
> >>>>> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> >>>>>>> +++
> >>>>> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> >>>>>>> @@ -355,7 +355,7 @@ public class MurmurHash3Test {
> >>>>>>>                pos += Character.toChars(codePoint, chars, pos);
> >>>>>>>            }
> >>>>>>>            final String text = String.copyValueOf(chars, 0,
> >> pos);
> >>>>>>> -            final byte[] bytes = text.getBytes();
> >>>>>>> +            final byte[] bytes =
> >>>>>> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >>>>>>>            final int h1 = MurmurHash3.hash32(bytes, 0,
> >>> bytes.length,
> >>>>>> seed);
> >>>>>>>            final int h2 = MurmurHash3.hash32(text);
> >>>>>>>            Assert.assertEquals(h1, h2);
> >>>>>>> @@ -455,7 +455,7 @@ public class MurmurHash3Test {
> >>>>>>>     */
> >>>>>>>    @Test
> >>>>>>>    public void testHash64() {
> >>>>>>> -        final byte[] origin = TEST_HASH64.getBytes();
> >>>>>>> +        final byte[] origin =
> >>>>>> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >>>>>>>        final long hash = MurmurHash3.hash64(origin);
> >>>>>>>        Assert.assertEquals(5785358552565094607L, hash);
> >>>>>>>    }
> >>>>>>> @@ -466,7 +466,7 @@ public class MurmurHash3Test {
> >>>>>>>     */
> >>>>>>>    @Test
> >>>>>>>    public void testHash64WithOffsetAndLength() {
> >>>>>>> -        final byte[] origin = TEST_HASH64.getBytes();
> >>>>>>> +        final byte[] origin =
> >>>>>> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >>>>>>>        final byte[] originOffset = new byte[origin.length +
> >> 150];
> >>>>>>>        Arrays.fill(originOffset, (byte) 123);
> >>>>>>>        System.arraycopy(origin, 0, originOffset, 150,
> >>>> origin.length);
> >>>>>>> @@ -627,7 +627,7 @@ public class MurmurHash3Test {
> >>>>>>>                pos += Character.toChars(codePoint, chars, pos);
> >>>>>>>            }
> >>>>>>>            final String text = String.copyValueOf(chars, 0,
> >> pos);
> >>>>>>> -            final byte[] bytes = text.getBytes();
> >>>>>>> +            final byte[] bytes =
> >>>>>> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >>>>>>>            final long[] h1 = MurmurHash3.hash128(bytes, 0,
> >>>>>> bytes.length, seed);
> >>>>>>>            final long[] h2 = MurmurHash3.hash128(text);
> >>>>>>>            Assert.assertArrayEquals(h1, h2);
> >>>>>>>
> >>>>>>
> >>>>>>
> >>>>>>
> >> ---------------------------------------------------------------------
> >>>>>> To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> >>>>>> For additional commands, e-mail: dev-help@commons.apache.org
>
>

Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Alex Herbert <al...@gmail.com>.

> On 30 Dec 2019, at 14:58, Gary Gregory <ga...@gmail.com> wrote:
> 
> On Mon, Dec 30, 2019 at 9:38 AM Alex Herbert <alex.d.herbert@gmail.com <ma...@gmail.com>>
> wrote:
> 
>> On Mon, 30 Dec 2019, 14:19 Gary Gregory, <ga...@gmail.com> wrote:
>> 
>>> On Mon, Dec 30, 2019 at 9:10 AM Alex Herbert <al...@gmail.com>
>>> wrote:
>>> 
>>>> On my second review I fixed the potential overflow in the incremental
>>> hash
>>>> for murmur3.
>>> 
>>> 
>>> I did note that fix flying by in a commit but I would have loved to see
>> it
>>> accompanied by a // comment. Since I am not sure that fix came with a
>> test,
>>> the fix could likely be lost in a future edit by someone wanting to
>> improve
>>> the readability of the code, YMMV on "readability ;-)
>>> 
>>> Please do feel free to check for other edge case like this one.
>>> 
>> 
>> I'll add a better comment later. Sorry.
>> 
>>> 
>> I did not add a test as Travis may not run it. It would require a 2GB array
>> to pass to the incremental. On the BaseNCodecTest I had to test for
>> available memory to run large memory allocation tests using assume to skip
>> it if there was not enough memory.
>> 
>> I'll try a test later and see if it works on Travis. The 3gb limit may be
>> enough to run this one. It will be interesting to see how long it takes to
>> hash a 2gb array.
>> 
>> The base 64 encoding of a 1gb array is a fair chunk of the test suite time
>> if it runs. It requires about 5gb of memory to run.
>> 
>> I have memory and speed improvements for BaseNCodec but not enough time to
>> finish them for this release. That can wait for next year.
>> 
> 
> That all sounds good. We can do a 1.14.1 release in January for performance
> improvements.
> 

I added a test for overflow in MurmurHash3Test. Using the original code this test throws. With the overflow safe length check it is OK. This is all locally with a lot of RAM to run the test.

Travis did not run the test. It gets a ’Skipped 1’ in JDK 8, 11 and 13. I did not check the others. I presume this is due to memory requirements.

I’ve had a scan through the code and this same overflow during incremental update is a problem in:

XXHash32

Other classes implementing Checksum (PureJavaCrc32, PureJavaCrc32C) use a different method to update from a byte array of a given length.

The overflow in XXHash32 can be solved with the same fix used in MurmurHash3 incremental hash. There is also an overflow issue in the getValue() method of XXHash32:

        if (totalLen > BUF_SIZE) {

Here totalLen is being used to store the number of bytes processed. All this check requires is to know that the state array has been updated since the reset with the original seed. If totalLen overflows to negative then the hash will be incorrect as it will not use the entire state array.

To really check this would require a XXHash32 implementation capable of hashing >2^31 bytes to get a reference answer. However a simple change to use a boolean flag to hold the state updated condition would fix the bug and pass all the current tests.

WDYT?


> Gary
> 
> 
>> 
>> 
>> 
>>> Gary
>>> 
>>> 
>>>> Maybe this type of code is present elsewhere in codec for
>>>> other incremental algorithms. Basically you have to guard against
>> someone
>>>> incrementally adding max length arrays which will overflow a small
>> count
>>> of
>>>> unprocessed bytes held as an integer.
>>>> 
>>>> I'm not at the computer now so can't check for other incrementals. I
>> can
>>> do
>>>> this later if you want or you could have a look. It's not a major
>> thing.
>>>> You would have to want to break the code to find this.
>>>> 
>>>> Alex
>>>> 
>>>> 
>>>> 
>>>> 
>>>> On Mon, 30 Dec 2019, 14:02 Gary Gregory, <ga...@gmail.com>
>> wrote:
>>>> 
>>>>> On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <
>> alex.d.herbert@gmail.com
>>>> 
>>>>> wrote:
>>>>> 
>>>>>> OK.
>>>>>> 
>>>>>> If you are going to make the behaviour change for the string
>> methods
>>>> then
>>>>>> the javadoc should be updated. It currently states:
>>>>>> 
>>>>>> "The string is converted to bytes using the default encoding.”
>>>>>> 
>>>>>> So perhaps switch this to:
>>>>>> 
>>>>>> “Before 1.14 the string was converted using default encoding. Since
>>>> 1.14
>>>>>> the string is converted to bytes using UTF-8 encoding.”
>>>>>> 
>>>>>> The MurmurHash3 methods should still be deprecated as they call the
>>>>>> hash32/hash128 functions that have sign bugs. I deliberately did
>> not
>>>> add
>>>>> a
>>>>>> new method accepting a String for the fixed murmur3 algorithm. IMO
>> it
>>>> is
>>>>>> unwarranted API bloat.
>>>>>> 
>>>>>> MurmurHash2 is not deprecated. In that case the String methods are
>>> just
>>>>>> API bloat but the javadoc should be updated.
>>>>>> 
>>>>>> Alex
>>>>>> 
>>>>> 
>>>>> Thank you Alex, please review Javadoc changes in git master.
>>>>> 
>>>>> Also, if there is anything else that can be improved in code,
>> Javadoc,
>>> or
>>>>> test coverage, please let me know so I can delay creating the release
>>>>> candidate, which will likely happen tonight (US EST) time permitting.
>>>>> 
>>>>> Gary
>>>>> 
>>>>> 
>>>>>> 
>>>>>>> On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
>>>>>>> 
>>>>>>> This is an automated email from the ASF dual-hosted git
>> repository.
>>>>>>> 
>>>>>>> ggregory pushed a commit to branch master
>>>>>>> in repository
>>> https://gitbox.apache.org/repos/asf/commons-codec.git
>>>>>>> 
>>>>>>> 
>>>>>>> The following commit(s) were added to refs/heads/master by this
>>> push:
>>>>>>>    new f40005a  [CODEC-276] Reliance on default encoding in
>>>>> MurmurHash2
>>>>>> and MurmurHash3.
>>>>>>> f40005a is described below
>>>>>>> 
>>>>>>> commit f40005ad5a9c892fa8d216b12029a49062224351
>>>>>>> Author: Gary Gregory <ga...@gmail.com>
>>>>>>> AuthorDate: Sun Dec 29 16:33:14 2019 -0500
>>>>>>> 
>>>>>>>   [CODEC-276] Reliance on default encoding in MurmurHash2 and
>>>>>> MurmurHash3.
>>>>>>> ---
>>>>>>> src/changes/changes.xml                            |  1 +
>>>>>>> .../apache/commons/codec/digest/MurmurHash2.java   | 22
>>>>>> ++++++++++++++++------
>>>>>>> .../apache/commons/codec/digest/MurmurHash3.java   | 18
>>>>>> ++++++++++++++----
>>>>>>> .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
>>>>>>> 4 files changed, 35 insertions(+), 14 deletions(-)
>>>>>>> 
>>>>>>> diff --git a/src/changes/changes.xml b/src/changes/changes.xml
>>>>>>> index 55ba445..9e1c3c4 100644
>>>>>>> --- a/src/changes/changes.xml
>>>>>>> +++ b/src/changes/changes.xml
>>>>>>> @@ -55,6 +55,7 @@ The <action> type attribute can be
>>>>>> add,update,fix,remove.
>>>>>>>      <action issue="CODEC-273" dev="ggregory" type="add"
>>>> due-to="Gary
>>>>>> Gregory">Add Path APIs to
>> org.apache.commons.codec.digest.DigestUtils
>>>>>> similar to File APIs.</action>
>>>>>>>      <action issue="CODEC-274" dev="ggregory" type="add"
>>>> due-to="Gary
>>>>>> Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9
>>> and
>>>>>> up.</action>
>>>>>>>      <action issue="CODEC-275" dev="ggregory" type="add"
>>>>> due-to="Claude
>>>>>> Warren">Add missing note in javadoc when sign extension error is
>>>> present
>>>>>> #34.</action>
>>>>>>> +      <action issue="CODEC-276" dev="ggregory" type="add"
>>>> due-to="Gary
>>>>>> Gregory">Reliance on default encoding in MurmurHash2 and
>>>>>> MurmurHash3.</action>
>>>>>>>    </release>
>>>>>>> 
>>>>>>>    <release version="1.13" date="2019-07-20"
>> description="Feature
>>>> and
>>>>>> fix release.">
>>>>>>> diff --git
>>>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>> index 5b0a712..4dfeca9 100644
>>>>>>> ---
>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>> +++
>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
>>>>>>> @@ -17,6 +17,9 @@
>>>>>>> 
>>>>>>> package org.apache.commons.codec.digest;
>>>>>>> 
>>>>>>> +import java.nio.charset.Charset;
>>>>>>> +import java.nio.charset.StandardCharsets;
>>>>>>> +
>>>>>>> /**
>>>>>>> * Implementation of the MurmurHash2 32-bit and 64-bit hash
>>>> functions.
>>>>>>> *
>>>>>>> @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
>>>>>>> */
>>>>>>> public final class MurmurHash2 {
>>>>>>> 
>>>>>>> +    /**
>>>>>>> +     * Default Charset used to convert strings into bytes.
>>>>>>> +     *
>>>>>>> +     * Consider private; package private for tests only.
>>>>>>> +     */
>>>>>>> +    static final Charset GET_BYTES_CHARSET =
>>> StandardCharsets.UTF_8;
>>>>>>> +
>>>>>>>    // Constants for 32-bit variant
>>>>>>>    private static final int M32 = 0x5bd1e995;
>>>>>>>    private static final int R32 = 24;
>>>>>>> @@ -132,7 +142,7 @@ public final class MurmurHash2 {
>>>>>>>     *
>>>>>>>     * <pre>
>>>>>>>     * int seed = 0x9747b28c;
>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
>>>>>>>     * </pre>
>>>>>>>     *
>>>>>>> @@ -141,7 +151,7 @@ public final class MurmurHash2 {
>>>>>>>     * @see #hash32(byte[], int, int)
>>>>>>>     */
>>>>>>>    public static int hash32(final String text) {
>>>>>>> -        final byte[] bytes = text.getBytes();
>>>>>>> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
>>>>>>>        return hash32(bytes, bytes.length);
>>>>>>>    }
>>>>>>> 
>>>>>>> @@ -152,7 +162,7 @@ public final class MurmurHash2 {
>>>>>>>     *
>>>>>>>     * <pre>
>>>>>>>     * int seed = 0x9747b28c;
>>>>>>> -     * byte[] bytes = text.substring(from, from +
>>>> length).getBytes();
>>>>>>> +     * byte[] bytes = text.substring(from, from +
>>>>>> length).getBytes(StandardCharsets.UTF_8);
>>>>>>>     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
>>>>>>>     * </pre>
>>>>>>>     *
>>>>>>> @@ -243,7 +253,7 @@ public final class MurmurHash2 {
>>>>>>>     *
>>>>>>>     * <pre>
>>>>>>>     * int seed = 0xe17a1465;
>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
>>>>>>>     * </pre>
>>>>>>>     *
>>>>>>> @@ -252,7 +262,7 @@ public final class MurmurHash2 {
>>>>>>>     * @see #hash64(byte[], int, int)
>>>>>>>     */
>>>>>>>    public static long hash64(final String text) {
>>>>>>> -        final byte[] bytes = text.getBytes();
>>>>>>> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
>>>>>>>        return hash64(bytes, bytes.length);
>>>>>>>    }
>>>>>>> 
>>>>>>> @@ -263,7 +273,7 @@ public final class MurmurHash2 {
>>>>>>>     *
>>>>>>>     * <pre>
>>>>>>>     * int seed = 0xe17a1465;
>>>>>>> -     * byte[] bytes = text.substring(from, from +
>>>> length).getBytes();
>>>>>>> +     * byte[] bytes = text.substring(from, from +
>>>>>> length).getBytes(StandardCharsets.UTF_8);
>>>>>>>     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
>>>>>>>     * </pre>
>>>>>>>     *
>>>>>>> diff --git
>>>>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>> index cfeb3d1..4509a8f 100644
>>>>>>> ---
>>> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>> +++
>>> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
>>>>>>> @@ -17,6 +17,9 @@
>>>>>>> 
>>>>>>> package org.apache.commons.codec.digest;
>>>>>>> 
>>>>>>> +import java.nio.charset.Charset;
>>>>>>> +import java.nio.charset.StandardCharsets;
>>>>>>> +
>>>>>>> /**
>>>>>>> * Implementation of the MurmurHash3 32-bit and 128-bit hash
>>>> functions.
>>>>>>> *
>>>>>>> @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
>>>>>>> public final class MurmurHash3 {
>>>>>>> 
>>>>>>>    /**
>>>>>>> +     * Default Charset used to convert strings into bytes.
>>>>>>> +     *
>>>>>>> +     * Consider private; package private for tests only.
>>>>>>> +     */
>>>>>>> +    static final Charset GET_BYTES_CHARSET =
>>> StandardCharsets.UTF_8;
>>>>>>> +
>>>>>>> +    /**
>>>>>>>     * A random number to use for a hash code.
>>>>>>>     *
>>>>>>>     * @deprecated This is not used internally and will be
>> removed
>>>> in a
>>>>>> future release.
>>>>>>> @@ -233,7 +243,7 @@ public final class MurmurHash3 {
>>>>>>>     * <pre>
>>>>>>>     * int offset = 0;
>>>>>>>     * int seed = 104729;
>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>     * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
>>>>> seed);
>>>>>>>     * </pre>
>>>>>>>     *
>>>>>>> @@ -249,7 +259,7 @@ public final class MurmurHash3 {
>>>>>>>     */
>>>>>>>    @Deprecated
>>>>>>>    public static int hash32(final String data) {
>>>>>>> -        final byte[] bytes = data.getBytes();
>>>>>>> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
>>>>>>>        return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
>>>>>>>    }
>>>>>>> 
>>>>>>> @@ -751,7 +761,7 @@ public final class MurmurHash3 {
>>>>>>>     * <pre>
>>>>>>>     * int offset = 0;
>>>>>>>     * int seed = 104729;
>>>>>>> -     * byte[] bytes = data.getBytes();
>>>>>>> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>>>>>>>     * int hash = MurmurHash3.hash128(bytes, offset,
>> bytes.length,
>>>>> seed);
>>>>>>>     * </pre>
>>>>>>>     *
>>>>>>> @@ -766,7 +776,7 @@ public final class MurmurHash3 {
>>>>>>>     */
>>>>>>>    @Deprecated
>>>>>>>    public static long[] hash128(final String data) {
>>>>>>> -        final byte[] bytes = data.getBytes();
>>>>>>> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
>>>>>>>        return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
>>>>>>>    }
>>>>>>> 
>>>>>>> diff --git
>>>>>> 
>> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>> 
>> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>> index 4703f9f..61df7f2 100644
>>>>>>> ---
>>>>> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>> +++
>>>>> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
>>>>>>> @@ -355,7 +355,7 @@ public class MurmurHash3Test {
>>>>>>>                pos += Character.toChars(codePoint, chars, pos);
>>>>>>>            }
>>>>>>>            final String text = String.copyValueOf(chars, 0,
>> pos);
>>>>>>> -            final byte[] bytes = text.getBytes();
>>>>>>> +            final byte[] bytes =
>>>>>> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>            final int h1 = MurmurHash3.hash32(bytes, 0,
>>> bytes.length,
>>>>>> seed);
>>>>>>>            final int h2 = MurmurHash3.hash32(text);
>>>>>>>            Assert.assertEquals(h1, h2);
>>>>>>> @@ -455,7 +455,7 @@ public class MurmurHash3Test {
>>>>>>>     */
>>>>>>>    @Test
>>>>>>>    public void testHash64() {
>>>>>>> -        final byte[] origin = TEST_HASH64.getBytes();
>>>>>>> +        final byte[] origin =
>>>>>> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>        final long hash = MurmurHash3.hash64(origin);
>>>>>>>        Assert.assertEquals(5785358552565094607L, hash);
>>>>>>>    }
>>>>>>> @@ -466,7 +466,7 @@ public class MurmurHash3Test {
>>>>>>>     */
>>>>>>>    @Test
>>>>>>>    public void testHash64WithOffsetAndLength() {
>>>>>>> -        final byte[] origin = TEST_HASH64.getBytes();
>>>>>>> +        final byte[] origin =
>>>>>> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>        final byte[] originOffset = new byte[origin.length +
>> 150];
>>>>>>>        Arrays.fill(originOffset, (byte) 123);
>>>>>>>        System.arraycopy(origin, 0, originOffset, 150,
>>>> origin.length);
>>>>>>> @@ -627,7 +627,7 @@ public class MurmurHash3Test {
>>>>>>>                pos += Character.toChars(codePoint, chars, pos);
>>>>>>>            }
>>>>>>>            final String text = String.copyValueOf(chars, 0,
>> pos);
>>>>>>> -            final byte[] bytes = text.getBytes();
>>>>>>> +            final byte[] bytes =
>>>>>> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>>>>>>>            final long[] h1 = MurmurHash3.hash128(bytes, 0,
>>>>>> bytes.length, seed);
>>>>>>>            final long[] h2 = MurmurHash3.hash128(text);
>>>>>>>            Assert.assertArrayEquals(h1, h2);
>>>>>>> 
>>>>>> 
>>>>>> 
>>>>>> 
>> ---------------------------------------------------------------------
>>>>>> To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
>>>>>> For additional commands, e-mail: dev-help@commons.apache.org


Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Gary Gregory <ga...@gmail.com>.
On Mon, Dec 30, 2019 at 9:38 AM Alex Herbert <al...@gmail.com>
wrote:

> On Mon, 30 Dec 2019, 14:19 Gary Gregory, <ga...@gmail.com> wrote:
>
> > On Mon, Dec 30, 2019 at 9:10 AM Alex Herbert <al...@gmail.com>
> > wrote:
> >
> > > On my second review I fixed the potential overflow in the incremental
> > hash
> > > for murmur3.
> >
> >
> > I did note that fix flying by in a commit but I would have loved to see
> it
> > accompanied by a // comment. Since I am not sure that fix came with a
> test,
> > the fix could likely be lost in a future edit by someone wanting to
> improve
> > the readability of the code, YMMV on "readability ;-)
> >
> > Please do feel free to check for other edge case like this one.
> >
>
> I'll add a better comment later. Sorry.
>
> >
> I did not add a test as Travis may not run it. It would require a 2GB array
> to pass to the incremental. On the BaseNCodecTest I had to test for
> available memory to run large memory allocation tests using assume to skip
> it if there was not enough memory.
>
> I'll try a test later and see if it works on Travis. The 3gb limit may be
> enough to run this one. It will be interesting to see how long it takes to
> hash a 2gb array.
>
> The base 64 encoding of a 1gb array is a fair chunk of the test suite time
> if it runs. It requires about 5gb of memory to run.
>
> I have memory and speed improvements for BaseNCodec but not enough time to
> finish them for this release. That can wait for next year.
>

That all sounds good. We can do a 1.14.1 release in January for performance
improvements.

Gary


>
>
>
> > Gary
> >
> >
> > > Maybe this type of code is present elsewhere in codec for
> > > other incremental algorithms. Basically you have to guard against
> someone
> > > incrementally adding max length arrays which will overflow a small
> count
> > of
> > > unprocessed bytes held as an integer.
> > >
> > > I'm not at the computer now so can't check for other incrementals. I
> can
> > do
> > > this later if you want or you could have a look. It's not a major
> thing.
> > > You would have to want to break the code to find this.
> > >
> > > Alex
> > >
> > >
> > >
> > >
> > > On Mon, 30 Dec 2019, 14:02 Gary Gregory, <ga...@gmail.com>
> wrote:
> > >
> > > > On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <
> alex.d.herbert@gmail.com
> > >
> > > > wrote:
> > > >
> > > > > OK.
> > > > >
> > > > > If you are going to make the behaviour change for the string
> methods
> > > then
> > > > > the javadoc should be updated. It currently states:
> > > > >
> > > > > "The string is converted to bytes using the default encoding.”
> > > > >
> > > > > So perhaps switch this to:
> > > > >
> > > > > “Before 1.14 the string was converted using default encoding. Since
> > > 1.14
> > > > > the string is converted to bytes using UTF-8 encoding.”
> > > > >
> > > > > The MurmurHash3 methods should still be deprecated as they call the
> > > > > hash32/hash128 functions that have sign bugs. I deliberately did
> not
> > > add
> > > > a
> > > > > new method accepting a String for the fixed murmur3 algorithm. IMO
> it
> > > is
> > > > > unwarranted API bloat.
> > > > >
> > > > > MurmurHash2 is not deprecated. In that case the String methods are
> > just
> > > > > API bloat but the javadoc should be updated.
> > > > >
> > > > > Alex
> > > > >
> > > >
> > > > Thank you Alex, please review Javadoc changes in git master.
> > > >
> > > > Also, if there is anything else that can be improved in code,
> Javadoc,
> > or
> > > > test coverage, please let me know so I can delay creating the release
> > > > candidate, which will likely happen tonight (US EST) time permitting.
> > > >
> > > > Gary
> > > >
> > > >
> > > > >
> > > > > > On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> > > > > >
> > > > > > This is an automated email from the ASF dual-hosted git
> repository.
> > > > > >
> > > > > > ggregory pushed a commit to branch master
> > > > > > in repository
> > https://gitbox.apache.org/repos/asf/commons-codec.git
> > > > > >
> > > > > >
> > > > > > The following commit(s) were added to refs/heads/master by this
> > push:
> > > > > >     new f40005a  [CODEC-276] Reliance on default encoding in
> > > > MurmurHash2
> > > > > and MurmurHash3.
> > > > > > f40005a is described below
> > > > > >
> > > > > > commit f40005ad5a9c892fa8d216b12029a49062224351
> > > > > > Author: Gary Gregory <ga...@gmail.com>
> > > > > > AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> > > > > >
> > > > > >    [CODEC-276] Reliance on default encoding in MurmurHash2 and
> > > > > MurmurHash3.
> > > > > > ---
> > > > > > src/changes/changes.xml                            |  1 +
> > > > > > .../apache/commons/codec/digest/MurmurHash2.java   | 22
> > > > > ++++++++++++++++------
> > > > > > .../apache/commons/codec/digest/MurmurHash3.java   | 18
> > > > > ++++++++++++++----
> > > > > > .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> > > > > > 4 files changed, 35 insertions(+), 14 deletions(-)
> > > > > >
> > > > > > diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> > > > > > index 55ba445..9e1c3c4 100644
> > > > > > --- a/src/changes/changes.xml
> > > > > > +++ b/src/changes/changes.xml
> > > > > > @@ -55,6 +55,7 @@ The <action> type attribute can be
> > > > > add,update,fix,remove.
> > > > > >       <action issue="CODEC-273" dev="ggregory" type="add"
> > > due-to="Gary
> > > > > Gregory">Add Path APIs to
> org.apache.commons.codec.digest.DigestUtils
> > > > > similar to File APIs.</action>
> > > > > >       <action issue="CODEC-274" dev="ggregory" type="add"
> > > due-to="Gary
> > > > > Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9
> > and
> > > > > up.</action>
> > > > > >       <action issue="CODEC-275" dev="ggregory" type="add"
> > > > due-to="Claude
> > > > > Warren">Add missing note in javadoc when sign extension error is
> > > present
> > > > > #34.</action>
> > > > > > +      <action issue="CODEC-276" dev="ggregory" type="add"
> > > due-to="Gary
> > > > > Gregory">Reliance on default encoding in MurmurHash2 and
> > > > > MurmurHash3.</action>
> > > > > >     </release>
> > > > > >
> > > > > >     <release version="1.13" date="2019-07-20"
> description="Feature
> > > and
> > > > > fix release.">
> > > > > > diff --git
> > > > > a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > > index 5b0a712..4dfeca9 100644
> > > > > > ---
> > a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > > +++
> > b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > > @@ -17,6 +17,9 @@
> > > > > >
> > > > > > package org.apache.commons.codec.digest;
> > > > > >
> > > > > > +import java.nio.charset.Charset;
> > > > > > +import java.nio.charset.StandardCharsets;
> > > > > > +
> > > > > > /**
> > > > > >  * Implementation of the MurmurHash2 32-bit and 64-bit hash
> > > functions.
> > > > > >  *
> > > > > > @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
> > > > > >  */
> > > > > > public final class MurmurHash2 {
> > > > > >
> > > > > > +    /**
> > > > > > +     * Default Charset used to convert strings into bytes.
> > > > > > +     *
> > > > > > +     * Consider private; package private for tests only.
> > > > > > +     */
> > > > > > +    static final Charset GET_BYTES_CHARSET =
> > StandardCharsets.UTF_8;
> > > > > > +
> > > > > >     // Constants for 32-bit variant
> > > > > >     private static final int M32 = 0x5bd1e995;
> > > > > >     private static final int R32 = 24;
> > > > > > @@ -132,7 +142,7 @@ public final class MurmurHash2 {
> > > > > >      *
> > > > > >      * <pre>
> > > > > >      * int seed = 0x9747b28c;
> > > > > > -     * byte[] bytes = data.getBytes();
> > > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > > > > >      * </pre>
> > > > > >      *
> > > > > > @@ -141,7 +151,7 @@ public final class MurmurHash2 {
> > > > > >      * @see #hash32(byte[], int, int)
> > > > > >      */
> > > > > >     public static int hash32(final String text) {
> > > > > > -        final byte[] bytes = text.getBytes();
> > > > > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > > > > >         return hash32(bytes, bytes.length);
> > > > > >     }
> > > > > >
> > > > > > @@ -152,7 +162,7 @@ public final class MurmurHash2 {
> > > > > >      *
> > > > > >      * <pre>
> > > > > >      * int seed = 0x9747b28c;
> > > > > > -     * byte[] bytes = text.substring(from, from +
> > > length).getBytes();
> > > > > > +     * byte[] bytes = text.substring(from, from +
> > > > > length).getBytes(StandardCharsets.UTF_8);
> > > > > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > > > > >      * </pre>
> > > > > >      *
> > > > > > @@ -243,7 +253,7 @@ public final class MurmurHash2 {
> > > > > >      *
> > > > > >      * <pre>
> > > > > >      * int seed = 0xe17a1465;
> > > > > > -     * byte[] bytes = data.getBytes();
> > > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > > > > >      * </pre>
> > > > > >      *
> > > > > > @@ -252,7 +262,7 @@ public final class MurmurHash2 {
> > > > > >      * @see #hash64(byte[], int, int)
> > > > > >      */
> > > > > >     public static long hash64(final String text) {
> > > > > > -        final byte[] bytes = text.getBytes();
> > > > > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > > > > >         return hash64(bytes, bytes.length);
> > > > > >     }
> > > > > >
> > > > > > @@ -263,7 +273,7 @@ public final class MurmurHash2 {
> > > > > >      *
> > > > > >      * <pre>
> > > > > >      * int seed = 0xe17a1465;
> > > > > > -     * byte[] bytes = text.substring(from, from +
> > > length).getBytes();
> > > > > > +     * byte[] bytes = text.substring(from, from +
> > > > > length).getBytes(StandardCharsets.UTF_8);
> > > > > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > > > > >      * </pre>
> > > > > >      *
> > > > > > diff --git
> > > > > a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > > index cfeb3d1..4509a8f 100644
> > > > > > ---
> > a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > > +++
> > b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > > @@ -17,6 +17,9 @@
> > > > > >
> > > > > > package org.apache.commons.codec.digest;
> > > > > >
> > > > > > +import java.nio.charset.Charset;
> > > > > > +import java.nio.charset.StandardCharsets;
> > > > > > +
> > > > > > /**
> > > > > >  * Implementation of the MurmurHash3 32-bit and 128-bit hash
> > > functions.
> > > > > >  *
> > > > > > @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> > > > > > public final class MurmurHash3 {
> > > > > >
> > > > > >     /**
> > > > > > +     * Default Charset used to convert strings into bytes.
> > > > > > +     *
> > > > > > +     * Consider private; package private for tests only.
> > > > > > +     */
> > > > > > +    static final Charset GET_BYTES_CHARSET =
> > StandardCharsets.UTF_8;
> > > > > > +
> > > > > > +    /**
> > > > > >      * A random number to use for a hash code.
> > > > > >      *
> > > > > >      * @deprecated This is not used internally and will be
> removed
> > > in a
> > > > > future release.
> > > > > > @@ -233,7 +243,7 @@ public final class MurmurHash3 {
> > > > > >      * <pre>
> > > > > >      * int offset = 0;
> > > > > >      * int seed = 104729;
> > > > > > -     * byte[] bytes = data.getBytes();
> > > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > > >      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
> > > > seed);
> > > > > >      * </pre>
> > > > > >      *
> > > > > > @@ -249,7 +259,7 @@ public final class MurmurHash3 {
> > > > > >      */
> > > > > >     @Deprecated
> > > > > >     public static int hash32(final String data) {
> > > > > > -        final byte[] bytes = data.getBytes();
> > > > > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > > > > >         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
> > > > > >     }
> > > > > >
> > > > > > @@ -751,7 +761,7 @@ public final class MurmurHash3 {
> > > > > >      * <pre>
> > > > > >      * int offset = 0;
> > > > > >      * int seed = 104729;
> > > > > > -     * byte[] bytes = data.getBytes();
> > > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > > >      * int hash = MurmurHash3.hash128(bytes, offset,
> bytes.length,
> > > > seed);
> > > > > >      * </pre>
> > > > > >      *
> > > > > > @@ -766,7 +776,7 @@ public final class MurmurHash3 {
> > > > > >      */
> > > > > >     @Deprecated
> > > > > >     public static long[] hash128(final String data) {
> > > > > > -        final byte[] bytes = data.getBytes();
> > > > > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > > > > >         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
> > > > > >     }
> > > > > >
> > > > > > diff --git
> > > > >
> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > >
> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > > > index 4703f9f..61df7f2 100644
> > > > > > ---
> > > > a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > > > +++
> > > > b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > > > @@ -355,7 +355,7 @@ public class MurmurHash3Test {
> > > > > >                 pos += Character.toChars(codePoint, chars, pos);
> > > > > >             }
> > > > > >             final String text = String.copyValueOf(chars, 0,
> pos);
> > > > > > -            final byte[] bytes = text.getBytes();
> > > > > > +            final byte[] bytes =
> > > > > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > > >             final int h1 = MurmurHash3.hash32(bytes, 0,
> > bytes.length,
> > > > > seed);
> > > > > >             final int h2 = MurmurHash3.hash32(text);
> > > > > >             Assert.assertEquals(h1, h2);
> > > > > > @@ -455,7 +455,7 @@ public class MurmurHash3Test {
> > > > > >      */
> > > > > >     @Test
> > > > > >     public void testHash64() {
> > > > > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > > > > +        final byte[] origin =
> > > > > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > > >         final long hash = MurmurHash3.hash64(origin);
> > > > > >         Assert.assertEquals(5785358552565094607L, hash);
> > > > > >     }
> > > > > > @@ -466,7 +466,7 @@ public class MurmurHash3Test {
> > > > > >      */
> > > > > >     @Test
> > > > > >     public void testHash64WithOffsetAndLength() {
> > > > > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > > > > +        final byte[] origin =
> > > > > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > > >         final byte[] originOffset = new byte[origin.length +
> 150];
> > > > > >         Arrays.fill(originOffset, (byte) 123);
> > > > > >         System.arraycopy(origin, 0, originOffset, 150,
> > > origin.length);
> > > > > > @@ -627,7 +627,7 @@ public class MurmurHash3Test {
> > > > > >                 pos += Character.toChars(codePoint, chars, pos);
> > > > > >             }
> > > > > >             final String text = String.copyValueOf(chars, 0,
> pos);
> > > > > > -            final byte[] bytes = text.getBytes();
> > > > > > +            final byte[] bytes =
> > > > > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > > >             final long[] h1 = MurmurHash3.hash128(bytes, 0,
> > > > > bytes.length, seed);
> > > > > >             final long[] h2 = MurmurHash3.hash128(text);
> > > > > >             Assert.assertArrayEquals(h1, h2);
> > > > > >
> > > > >
> > > > >
> > > > >
> ---------------------------------------------------------------------
> > > > > To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> > > > > For additional commands, e-mail: dev-help@commons.apache.org
> > > > >
> > > > >
> > > >
> > >
> >
>

Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Alex Herbert <al...@gmail.com>.
On Mon, 30 Dec 2019, 14:19 Gary Gregory, <ga...@gmail.com> wrote:

> On Mon, Dec 30, 2019 at 9:10 AM Alex Herbert <al...@gmail.com>
> wrote:
>
> > On my second review I fixed the potential overflow in the incremental
> hash
> > for murmur3.
>
>
> I did note that fix flying by in a commit but I would have loved to see it
> accompanied by a // comment. Since I am not sure that fix came with a test,
> the fix could likely be lost in a future edit by someone wanting to improve
> the readability of the code, YMMV on "readability ;-)
>
> Please do feel free to check for other edge case like this one.
>

I'll add a better comment later. Sorry.

>
I did not add a test as Travis may not run it. It would require a 2GB array
to pass to the incremental. On the BaseNCodecTest I had to test for
available memory to run large memory allocation tests using assume to skip
it if there was not enough memory.

I'll try a test later and see if it works on Travis. The 3gb limit may be
enough to run this one. It will be interesting to see how long it takes to
hash a 2gb array.

The base 64 encoding of a 1gb array is a fair chunk of the test suite time
if it runs. It requires about 5gb of memory to run.

I have memory and speed improvements for BaseNCodec but not enough time to
finish them for this release. That can wait for next year.



> Gary
>
>
> > Maybe this type of code is present elsewhere in codec for
> > other incremental algorithms. Basically you have to guard against someone
> > incrementally adding max length arrays which will overflow a small count
> of
> > unprocessed bytes held as an integer.
> >
> > I'm not at the computer now so can't check for other incrementals. I can
> do
> > this later if you want or you could have a look. It's not a major thing.
> > You would have to want to break the code to find this.
> >
> > Alex
> >
> >
> >
> >
> > On Mon, 30 Dec 2019, 14:02 Gary Gregory, <ga...@gmail.com> wrote:
> >
> > > On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <alex.d.herbert@gmail.com
> >
> > > wrote:
> > >
> > > > OK.
> > > >
> > > > If you are going to make the behaviour change for the string methods
> > then
> > > > the javadoc should be updated. It currently states:
> > > >
> > > > "The string is converted to bytes using the default encoding.”
> > > >
> > > > So perhaps switch this to:
> > > >
> > > > “Before 1.14 the string was converted using default encoding. Since
> > 1.14
> > > > the string is converted to bytes using UTF-8 encoding.”
> > > >
> > > > The MurmurHash3 methods should still be deprecated as they call the
> > > > hash32/hash128 functions that have sign bugs. I deliberately did not
> > add
> > > a
> > > > new method accepting a String for the fixed murmur3 algorithm. IMO it
> > is
> > > > unwarranted API bloat.
> > > >
> > > > MurmurHash2 is not deprecated. In that case the String methods are
> just
> > > > API bloat but the javadoc should be updated.
> > > >
> > > > Alex
> > > >
> > >
> > > Thank you Alex, please review Javadoc changes in git master.
> > >
> > > Also, if there is anything else that can be improved in code, Javadoc,
> or
> > > test coverage, please let me know so I can delay creating the release
> > > candidate, which will likely happen tonight (US EST) time permitting.
> > >
> > > Gary
> > >
> > >
> > > >
> > > > > On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> > > > >
> > > > > This is an automated email from the ASF dual-hosted git repository.
> > > > >
> > > > > ggregory pushed a commit to branch master
> > > > > in repository
> https://gitbox.apache.org/repos/asf/commons-codec.git
> > > > >
> > > > >
> > > > > The following commit(s) were added to refs/heads/master by this
> push:
> > > > >     new f40005a  [CODEC-276] Reliance on default encoding in
> > > MurmurHash2
> > > > and MurmurHash3.
> > > > > f40005a is described below
> > > > >
> > > > > commit f40005ad5a9c892fa8d216b12029a49062224351
> > > > > Author: Gary Gregory <ga...@gmail.com>
> > > > > AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> > > > >
> > > > >    [CODEC-276] Reliance on default encoding in MurmurHash2 and
> > > > MurmurHash3.
> > > > > ---
> > > > > src/changes/changes.xml                            |  1 +
> > > > > .../apache/commons/codec/digest/MurmurHash2.java   | 22
> > > > ++++++++++++++++------
> > > > > .../apache/commons/codec/digest/MurmurHash3.java   | 18
> > > > ++++++++++++++----
> > > > > .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> > > > > 4 files changed, 35 insertions(+), 14 deletions(-)
> > > > >
> > > > > diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> > > > > index 55ba445..9e1c3c4 100644
> > > > > --- a/src/changes/changes.xml
> > > > > +++ b/src/changes/changes.xml
> > > > > @@ -55,6 +55,7 @@ The <action> type attribute can be
> > > > add,update,fix,remove.
> > > > >       <action issue="CODEC-273" dev="ggregory" type="add"
> > due-to="Gary
> > > > Gregory">Add Path APIs to org.apache.commons.codec.digest.DigestUtils
> > > > similar to File APIs.</action>
> > > > >       <action issue="CODEC-274" dev="ggregory" type="add"
> > due-to="Gary
> > > > Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9
> and
> > > > up.</action>
> > > > >       <action issue="CODEC-275" dev="ggregory" type="add"
> > > due-to="Claude
> > > > Warren">Add missing note in javadoc when sign extension error is
> > present
> > > > #34.</action>
> > > > > +      <action issue="CODEC-276" dev="ggregory" type="add"
> > due-to="Gary
> > > > Gregory">Reliance on default encoding in MurmurHash2 and
> > > > MurmurHash3.</action>
> > > > >     </release>
> > > > >
> > > > >     <release version="1.13" date="2019-07-20" description="Feature
> > and
> > > > fix release.">
> > > > > diff --git
> > > > a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > index 5b0a712..4dfeca9 100644
> > > > > ---
> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > +++
> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > > @@ -17,6 +17,9 @@
> > > > >
> > > > > package org.apache.commons.codec.digest;
> > > > >
> > > > > +import java.nio.charset.Charset;
> > > > > +import java.nio.charset.StandardCharsets;
> > > > > +
> > > > > /**
> > > > >  * Implementation of the MurmurHash2 32-bit and 64-bit hash
> > functions.
> > > > >  *
> > > > > @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
> > > > >  */
> > > > > public final class MurmurHash2 {
> > > > >
> > > > > +    /**
> > > > > +     * Default Charset used to convert strings into bytes.
> > > > > +     *
> > > > > +     * Consider private; package private for tests only.
> > > > > +     */
> > > > > +    static final Charset GET_BYTES_CHARSET =
> StandardCharsets.UTF_8;
> > > > > +
> > > > >     // Constants for 32-bit variant
> > > > >     private static final int M32 = 0x5bd1e995;
> > > > >     private static final int R32 = 24;
> > > > > @@ -132,7 +142,7 @@ public final class MurmurHash2 {
> > > > >      *
> > > > >      * <pre>
> > > > >      * int seed = 0x9747b28c;
> > > > > -     * byte[] bytes = data.getBytes();
> > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > > > >      * </pre>
> > > > >      *
> > > > > @@ -141,7 +151,7 @@ public final class MurmurHash2 {
> > > > >      * @see #hash32(byte[], int, int)
> > > > >      */
> > > > >     public static int hash32(final String text) {
> > > > > -        final byte[] bytes = text.getBytes();
> > > > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > > > >         return hash32(bytes, bytes.length);
> > > > >     }
> > > > >
> > > > > @@ -152,7 +162,7 @@ public final class MurmurHash2 {
> > > > >      *
> > > > >      * <pre>
> > > > >      * int seed = 0x9747b28c;
> > > > > -     * byte[] bytes = text.substring(from, from +
> > length).getBytes();
> > > > > +     * byte[] bytes = text.substring(from, from +
> > > > length).getBytes(StandardCharsets.UTF_8);
> > > > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > > > >      * </pre>
> > > > >      *
> > > > > @@ -243,7 +253,7 @@ public final class MurmurHash2 {
> > > > >      *
> > > > >      * <pre>
> > > > >      * int seed = 0xe17a1465;
> > > > > -     * byte[] bytes = data.getBytes();
> > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > > > >      * </pre>
> > > > >      *
> > > > > @@ -252,7 +262,7 @@ public final class MurmurHash2 {
> > > > >      * @see #hash64(byte[], int, int)
> > > > >      */
> > > > >     public static long hash64(final String text) {
> > > > > -        final byte[] bytes = text.getBytes();
> > > > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > > > >         return hash64(bytes, bytes.length);
> > > > >     }
> > > > >
> > > > > @@ -263,7 +273,7 @@ public final class MurmurHash2 {
> > > > >      *
> > > > >      * <pre>
> > > > >      * int seed = 0xe17a1465;
> > > > > -     * byte[] bytes = text.substring(from, from +
> > length).getBytes();
> > > > > +     * byte[] bytes = text.substring(from, from +
> > > > length).getBytes(StandardCharsets.UTF_8);
> > > > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > > > >      * </pre>
> > > > >      *
> > > > > diff --git
> > > > a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > index cfeb3d1..4509a8f 100644
> > > > > ---
> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > +++
> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > > @@ -17,6 +17,9 @@
> > > > >
> > > > > package org.apache.commons.codec.digest;
> > > > >
> > > > > +import java.nio.charset.Charset;
> > > > > +import java.nio.charset.StandardCharsets;
> > > > > +
> > > > > /**
> > > > >  * Implementation of the MurmurHash3 32-bit and 128-bit hash
> > functions.
> > > > >  *
> > > > > @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> > > > > public final class MurmurHash3 {
> > > > >
> > > > >     /**
> > > > > +     * Default Charset used to convert strings into bytes.
> > > > > +     *
> > > > > +     * Consider private; package private for tests only.
> > > > > +     */
> > > > > +    static final Charset GET_BYTES_CHARSET =
> StandardCharsets.UTF_8;
> > > > > +
> > > > > +    /**
> > > > >      * A random number to use for a hash code.
> > > > >      *
> > > > >      * @deprecated This is not used internally and will be removed
> > in a
> > > > future release.
> > > > > @@ -233,7 +243,7 @@ public final class MurmurHash3 {
> > > > >      * <pre>
> > > > >      * int offset = 0;
> > > > >      * int seed = 104729;
> > > > > -     * byte[] bytes = data.getBytes();
> > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > >      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
> > > seed);
> > > > >      * </pre>
> > > > >      *
> > > > > @@ -249,7 +259,7 @@ public final class MurmurHash3 {
> > > > >      */
> > > > >     @Deprecated
> > > > >     public static int hash32(final String data) {
> > > > > -        final byte[] bytes = data.getBytes();
> > > > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > > > >         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
> > > > >     }
> > > > >
> > > > > @@ -751,7 +761,7 @@ public final class MurmurHash3 {
> > > > >      * <pre>
> > > > >      * int offset = 0;
> > > > >      * int seed = 104729;
> > > > > -     * byte[] bytes = data.getBytes();
> > > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > > >      * int hash = MurmurHash3.hash128(bytes, offset, bytes.length,
> > > seed);
> > > > >      * </pre>
> > > > >      *
> > > > > @@ -766,7 +776,7 @@ public final class MurmurHash3 {
> > > > >      */
> > > > >     @Deprecated
> > > > >     public static long[] hash128(final String data) {
> > > > > -        final byte[] bytes = data.getBytes();
> > > > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > > > >         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
> > > > >     }
> > > > >
> > > > > diff --git
> > > > a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > > index 4703f9f..61df7f2 100644
> > > > > ---
> > > a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > > +++
> > > b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > > @@ -355,7 +355,7 @@ public class MurmurHash3Test {
> > > > >                 pos += Character.toChars(codePoint, chars, pos);
> > > > >             }
> > > > >             final String text = String.copyValueOf(chars, 0, pos);
> > > > > -            final byte[] bytes = text.getBytes();
> > > > > +            final byte[] bytes =
> > > > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > >             final int h1 = MurmurHash3.hash32(bytes, 0,
> bytes.length,
> > > > seed);
> > > > >             final int h2 = MurmurHash3.hash32(text);
> > > > >             Assert.assertEquals(h1, h2);
> > > > > @@ -455,7 +455,7 @@ public class MurmurHash3Test {
> > > > >      */
> > > > >     @Test
> > > > >     public void testHash64() {
> > > > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > > > +        final byte[] origin =
> > > > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > >         final long hash = MurmurHash3.hash64(origin);
> > > > >         Assert.assertEquals(5785358552565094607L, hash);
> > > > >     }
> > > > > @@ -466,7 +466,7 @@ public class MurmurHash3Test {
> > > > >      */
> > > > >     @Test
> > > > >     public void testHash64WithOffsetAndLength() {
> > > > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > > > +        final byte[] origin =
> > > > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > >         final byte[] originOffset = new byte[origin.length + 150];
> > > > >         Arrays.fill(originOffset, (byte) 123);
> > > > >         System.arraycopy(origin, 0, originOffset, 150,
> > origin.length);
> > > > > @@ -627,7 +627,7 @@ public class MurmurHash3Test {
> > > > >                 pos += Character.toChars(codePoint, chars, pos);
> > > > >             }
> > > > >             final String text = String.copyValueOf(chars, 0, pos);
> > > > > -            final byte[] bytes = text.getBytes();
> > > > > +            final byte[] bytes =
> > > > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > > >             final long[] h1 = MurmurHash3.hash128(bytes, 0,
> > > > bytes.length, seed);
> > > > >             final long[] h2 = MurmurHash3.hash128(text);
> > > > >             Assert.assertArrayEquals(h1, h2);
> > > > >
> > > >
> > > >
> > > > ---------------------------------------------------------------------
> > > > To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> > > > For additional commands, e-mail: dev-help@commons.apache.org
> > > >
> > > >
> > >
> >
>

Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Gary Gregory <ga...@gmail.com>.
On Mon, Dec 30, 2019 at 9:10 AM Alex Herbert <al...@gmail.com>
wrote:

> On my second review I fixed the potential overflow in the incremental hash
> for murmur3.


I did note that fix flying by in a commit but I would have loved to see it
accompanied by a // comment. Since I am not sure that fix came with a test,
the fix could likely be lost in a future edit by someone wanting to improve
the readability of the code, YMMV on "readability ;-)

Please do feel free to check for other edge case like this one.

Gary


> Maybe this type of code is present elsewhere in codec for
> other incremental algorithms. Basically you have to guard against someone
> incrementally adding max length arrays which will overflow a small count of
> unprocessed bytes held as an integer.
>
> I'm not at the computer now so can't check for other incrementals. I can do
> this later if you want or you could have a look. It's not a major thing.
> You would have to want to break the code to find this.
>
> Alex
>
>
>
>
> On Mon, 30 Dec 2019, 14:02 Gary Gregory, <ga...@gmail.com> wrote:
>
> > On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <al...@gmail.com>
> > wrote:
> >
> > > OK.
> > >
> > > If you are going to make the behaviour change for the string methods
> then
> > > the javadoc should be updated. It currently states:
> > >
> > > "The string is converted to bytes using the default encoding.”
> > >
> > > So perhaps switch this to:
> > >
> > > “Before 1.14 the string was converted using default encoding. Since
> 1.14
> > > the string is converted to bytes using UTF-8 encoding.”
> > >
> > > The MurmurHash3 methods should still be deprecated as they call the
> > > hash32/hash128 functions that have sign bugs. I deliberately did not
> add
> > a
> > > new method accepting a String for the fixed murmur3 algorithm. IMO it
> is
> > > unwarranted API bloat.
> > >
> > > MurmurHash2 is not deprecated. In that case the String methods are just
> > > API bloat but the javadoc should be updated.
> > >
> > > Alex
> > >
> >
> > Thank you Alex, please review Javadoc changes in git master.
> >
> > Also, if there is anything else that can be improved in code, Javadoc, or
> > test coverage, please let me know so I can delay creating the release
> > candidate, which will likely happen tonight (US EST) time permitting.
> >
> > Gary
> >
> >
> > >
> > > > On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> > > >
> > > > This is an automated email from the ASF dual-hosted git repository.
> > > >
> > > > ggregory pushed a commit to branch master
> > > > in repository https://gitbox.apache.org/repos/asf/commons-codec.git
> > > >
> > > >
> > > > The following commit(s) were added to refs/heads/master by this push:
> > > >     new f40005a  [CODEC-276] Reliance on default encoding in
> > MurmurHash2
> > > and MurmurHash3.
> > > > f40005a is described below
> > > >
> > > > commit f40005ad5a9c892fa8d216b12029a49062224351
> > > > Author: Gary Gregory <ga...@gmail.com>
> > > > AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> > > >
> > > >    [CODEC-276] Reliance on default encoding in MurmurHash2 and
> > > MurmurHash3.
> > > > ---
> > > > src/changes/changes.xml                            |  1 +
> > > > .../apache/commons/codec/digest/MurmurHash2.java   | 22
> > > ++++++++++++++++------
> > > > .../apache/commons/codec/digest/MurmurHash3.java   | 18
> > > ++++++++++++++----
> > > > .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> > > > 4 files changed, 35 insertions(+), 14 deletions(-)
> > > >
> > > > diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> > > > index 55ba445..9e1c3c4 100644
> > > > --- a/src/changes/changes.xml
> > > > +++ b/src/changes/changes.xml
> > > > @@ -55,6 +55,7 @@ The <action> type attribute can be
> > > add,update,fix,remove.
> > > >       <action issue="CODEC-273" dev="ggregory" type="add"
> due-to="Gary
> > > Gregory">Add Path APIs to org.apache.commons.codec.digest.DigestUtils
> > > similar to File APIs.</action>
> > > >       <action issue="CODEC-274" dev="ggregory" type="add"
> due-to="Gary
> > > Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9 and
> > > up.</action>
> > > >       <action issue="CODEC-275" dev="ggregory" type="add"
> > due-to="Claude
> > > Warren">Add missing note in javadoc when sign extension error is
> present
> > > #34.</action>
> > > > +      <action issue="CODEC-276" dev="ggregory" type="add"
> due-to="Gary
> > > Gregory">Reliance on default encoding in MurmurHash2 and
> > > MurmurHash3.</action>
> > > >     </release>
> > > >
> > > >     <release version="1.13" date="2019-07-20" description="Feature
> and
> > > fix release.">
> > > > diff --git
> > > a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > index 5b0a712..4dfeca9 100644
> > > > --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > > @@ -17,6 +17,9 @@
> > > >
> > > > package org.apache.commons.codec.digest;
> > > >
> > > > +import java.nio.charset.Charset;
> > > > +import java.nio.charset.StandardCharsets;
> > > > +
> > > > /**
> > > >  * Implementation of the MurmurHash2 32-bit and 64-bit hash
> functions.
> > > >  *
> > > > @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
> > > >  */
> > > > public final class MurmurHash2 {
> > > >
> > > > +    /**
> > > > +     * Default Charset used to convert strings into bytes.
> > > > +     *
> > > > +     * Consider private; package private for tests only.
> > > > +     */
> > > > +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> > > > +
> > > >     // Constants for 32-bit variant
> > > >     private static final int M32 = 0x5bd1e995;
> > > >     private static final int R32 = 24;
> > > > @@ -132,7 +142,7 @@ public final class MurmurHash2 {
> > > >      *
> > > >      * <pre>
> > > >      * int seed = 0x9747b28c;
> > > > -     * byte[] bytes = data.getBytes();
> > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > > >      * </pre>
> > > >      *
> > > > @@ -141,7 +151,7 @@ public final class MurmurHash2 {
> > > >      * @see #hash32(byte[], int, int)
> > > >      */
> > > >     public static int hash32(final String text) {
> > > > -        final byte[] bytes = text.getBytes();
> > > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > > >         return hash32(bytes, bytes.length);
> > > >     }
> > > >
> > > > @@ -152,7 +162,7 @@ public final class MurmurHash2 {
> > > >      *
> > > >      * <pre>
> > > >      * int seed = 0x9747b28c;
> > > > -     * byte[] bytes = text.substring(from, from +
> length).getBytes();
> > > > +     * byte[] bytes = text.substring(from, from +
> > > length).getBytes(StandardCharsets.UTF_8);
> > > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > > >      * </pre>
> > > >      *
> > > > @@ -243,7 +253,7 @@ public final class MurmurHash2 {
> > > >      *
> > > >      * <pre>
> > > >      * int seed = 0xe17a1465;
> > > > -     * byte[] bytes = data.getBytes();
> > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > > >      * </pre>
> > > >      *
> > > > @@ -252,7 +262,7 @@ public final class MurmurHash2 {
> > > >      * @see #hash64(byte[], int, int)
> > > >      */
> > > >     public static long hash64(final String text) {
> > > > -        final byte[] bytes = text.getBytes();
> > > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > > >         return hash64(bytes, bytes.length);
> > > >     }
> > > >
> > > > @@ -263,7 +273,7 @@ public final class MurmurHash2 {
> > > >      *
> > > >      * <pre>
> > > >      * int seed = 0xe17a1465;
> > > > -     * byte[] bytes = text.substring(from, from +
> length).getBytes();
> > > > +     * byte[] bytes = text.substring(from, from +
> > > length).getBytes(StandardCharsets.UTF_8);
> > > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > > >      * </pre>
> > > >      *
> > > > diff --git
> > > a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > index cfeb3d1..4509a8f 100644
> > > > --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > > @@ -17,6 +17,9 @@
> > > >
> > > > package org.apache.commons.codec.digest;
> > > >
> > > > +import java.nio.charset.Charset;
> > > > +import java.nio.charset.StandardCharsets;
> > > > +
> > > > /**
> > > >  * Implementation of the MurmurHash3 32-bit and 128-bit hash
> functions.
> > > >  *
> > > > @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> > > > public final class MurmurHash3 {
> > > >
> > > >     /**
> > > > +     * Default Charset used to convert strings into bytes.
> > > > +     *
> > > > +     * Consider private; package private for tests only.
> > > > +     */
> > > > +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> > > > +
> > > > +    /**
> > > >      * A random number to use for a hash code.
> > > >      *
> > > >      * @deprecated This is not used internally and will be removed
> in a
> > > future release.
> > > > @@ -233,7 +243,7 @@ public final class MurmurHash3 {
> > > >      * <pre>
> > > >      * int offset = 0;
> > > >      * int seed = 104729;
> > > > -     * byte[] bytes = data.getBytes();
> > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > >      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
> > seed);
> > > >      * </pre>
> > > >      *
> > > > @@ -249,7 +259,7 @@ public final class MurmurHash3 {
> > > >      */
> > > >     @Deprecated
> > > >     public static int hash32(final String data) {
> > > > -        final byte[] bytes = data.getBytes();
> > > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > > >         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
> > > >     }
> > > >
> > > > @@ -751,7 +761,7 @@ public final class MurmurHash3 {
> > > >      * <pre>
> > > >      * int offset = 0;
> > > >      * int seed = 104729;
> > > > -     * byte[] bytes = data.getBytes();
> > > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > > >      * int hash = MurmurHash3.hash128(bytes, offset, bytes.length,
> > seed);
> > > >      * </pre>
> > > >      *
> > > > @@ -766,7 +776,7 @@ public final class MurmurHash3 {
> > > >      */
> > > >     @Deprecated
> > > >     public static long[] hash128(final String data) {
> > > > -        final byte[] bytes = data.getBytes();
> > > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > > >         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
> > > >     }
> > > >
> > > > diff --git
> > > a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > index 4703f9f..61df7f2 100644
> > > > ---
> > a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > +++
> > b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > > @@ -355,7 +355,7 @@ public class MurmurHash3Test {
> > > >                 pos += Character.toChars(codePoint, chars, pos);
> > > >             }
> > > >             final String text = String.copyValueOf(chars, 0, pos);
> > > > -            final byte[] bytes = text.getBytes();
> > > > +            final byte[] bytes =
> > > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > >             final int h1 = MurmurHash3.hash32(bytes, 0, bytes.length,
> > > seed);
> > > >             final int h2 = MurmurHash3.hash32(text);
> > > >             Assert.assertEquals(h1, h2);
> > > > @@ -455,7 +455,7 @@ public class MurmurHash3Test {
> > > >      */
> > > >     @Test
> > > >     public void testHash64() {
> > > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > > +        final byte[] origin =
> > > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > >         final long hash = MurmurHash3.hash64(origin);
> > > >         Assert.assertEquals(5785358552565094607L, hash);
> > > >     }
> > > > @@ -466,7 +466,7 @@ public class MurmurHash3Test {
> > > >      */
> > > >     @Test
> > > >     public void testHash64WithOffsetAndLength() {
> > > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > > +        final byte[] origin =
> > > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > >         final byte[] originOffset = new byte[origin.length + 150];
> > > >         Arrays.fill(originOffset, (byte) 123);
> > > >         System.arraycopy(origin, 0, originOffset, 150,
> origin.length);
> > > > @@ -627,7 +627,7 @@ public class MurmurHash3Test {
> > > >                 pos += Character.toChars(codePoint, chars, pos);
> > > >             }
> > > >             final String text = String.copyValueOf(chars, 0, pos);
> > > > -            final byte[] bytes = text.getBytes();
> > > > +            final byte[] bytes =
> > > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > > >             final long[] h1 = MurmurHash3.hash128(bytes, 0,
> > > bytes.length, seed);
> > > >             final long[] h2 = MurmurHash3.hash128(text);
> > > >             Assert.assertArrayEquals(h1, h2);
> > > >
> > >
> > >
> > > ---------------------------------------------------------------------
> > > To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> > > For additional commands, e-mail: dev-help@commons.apache.org
> > >
> > >
> >
>

Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Alex Herbert <al...@gmail.com>.
On my second review I fixed the potential overflow in the incremental hash
for murmur3. Maybe this type of code is present elsewhere in codec for
other incremental algorithms. Basically you have to guard against someone
incrementally adding max length arrays which will overflow a small count of
unprocessed bytes held as an integer.

I'm not at the computer now so can't check for other incrementals. I can do
this later if you want or you could have a look. It's not a major thing.
You would have to want to break the code to find this.

Alex




On Mon, 30 Dec 2019, 14:02 Gary Gregory, <ga...@gmail.com> wrote:

> On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <al...@gmail.com>
> wrote:
>
> > OK.
> >
> > If you are going to make the behaviour change for the string methods then
> > the javadoc should be updated. It currently states:
> >
> > "The string is converted to bytes using the default encoding.”
> >
> > So perhaps switch this to:
> >
> > “Before 1.14 the string was converted using default encoding. Since 1.14
> > the string is converted to bytes using UTF-8 encoding.”
> >
> > The MurmurHash3 methods should still be deprecated as they call the
> > hash32/hash128 functions that have sign bugs. I deliberately did not add
> a
> > new method accepting a String for the fixed murmur3 algorithm. IMO it is
> > unwarranted API bloat.
> >
> > MurmurHash2 is not deprecated. In that case the String methods are just
> > API bloat but the javadoc should be updated.
> >
> > Alex
> >
>
> Thank you Alex, please review Javadoc changes in git master.
>
> Also, if there is anything else that can be improved in code, Javadoc, or
> test coverage, please let me know so I can delay creating the release
> candidate, which will likely happen tonight (US EST) time permitting.
>
> Gary
>
>
> >
> > > On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> > >
> > > This is an automated email from the ASF dual-hosted git repository.
> > >
> > > ggregory pushed a commit to branch master
> > > in repository https://gitbox.apache.org/repos/asf/commons-codec.git
> > >
> > >
> > > The following commit(s) were added to refs/heads/master by this push:
> > >     new f40005a  [CODEC-276] Reliance on default encoding in
> MurmurHash2
> > and MurmurHash3.
> > > f40005a is described below
> > >
> > > commit f40005ad5a9c892fa8d216b12029a49062224351
> > > Author: Gary Gregory <ga...@gmail.com>
> > > AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> > >
> > >    [CODEC-276] Reliance on default encoding in MurmurHash2 and
> > MurmurHash3.
> > > ---
> > > src/changes/changes.xml                            |  1 +
> > > .../apache/commons/codec/digest/MurmurHash2.java   | 22
> > ++++++++++++++++------
> > > .../apache/commons/codec/digest/MurmurHash3.java   | 18
> > ++++++++++++++----
> > > .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> > > 4 files changed, 35 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> > > index 55ba445..9e1c3c4 100644
> > > --- a/src/changes/changes.xml
> > > +++ b/src/changes/changes.xml
> > > @@ -55,6 +55,7 @@ The <action> type attribute can be
> > add,update,fix,remove.
> > >       <action issue="CODEC-273" dev="ggregory" type="add" due-to="Gary
> > Gregory">Add Path APIs to org.apache.commons.codec.digest.DigestUtils
> > similar to File APIs.</action>
> > >       <action issue="CODEC-274" dev="ggregory" type="add" due-to="Gary
> > Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9 and
> > up.</action>
> > >       <action issue="CODEC-275" dev="ggregory" type="add"
> due-to="Claude
> > Warren">Add missing note in javadoc when sign extension error is present
> > #34.</action>
> > > +      <action issue="CODEC-276" dev="ggregory" type="add" due-to="Gary
> > Gregory">Reliance on default encoding in MurmurHash2 and
> > MurmurHash3.</action>
> > >     </release>
> > >
> > >     <release version="1.13" date="2019-07-20" description="Feature and
> > fix release.">
> > > diff --git
> > a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > index 5b0a712..4dfeca9 100644
> > > --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > > @@ -17,6 +17,9 @@
> > >
> > > package org.apache.commons.codec.digest;
> > >
> > > +import java.nio.charset.Charset;
> > > +import java.nio.charset.StandardCharsets;
> > > +
> > > /**
> > >  * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
> > >  *
> > > @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
> > >  */
> > > public final class MurmurHash2 {
> > >
> > > +    /**
> > > +     * Default Charset used to convert strings into bytes.
> > > +     *
> > > +     * Consider private; package private for tests only.
> > > +     */
> > > +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> > > +
> > >     // Constants for 32-bit variant
> > >     private static final int M32 = 0x5bd1e995;
> > >     private static final int R32 = 24;
> > > @@ -132,7 +142,7 @@ public final class MurmurHash2 {
> > >      *
> > >      * <pre>
> > >      * int seed = 0x9747b28c;
> > > -     * byte[] bytes = data.getBytes();
> > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > >      * </pre>
> > >      *
> > > @@ -141,7 +151,7 @@ public final class MurmurHash2 {
> > >      * @see #hash32(byte[], int, int)
> > >      */
> > >     public static int hash32(final String text) {
> > > -        final byte[] bytes = text.getBytes();
> > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > >         return hash32(bytes, bytes.length);
> > >     }
> > >
> > > @@ -152,7 +162,7 @@ public final class MurmurHash2 {
> > >      *
> > >      * <pre>
> > >      * int seed = 0x9747b28c;
> > > -     * byte[] bytes = text.substring(from, from + length).getBytes();
> > > +     * byte[] bytes = text.substring(from, from +
> > length).getBytes(StandardCharsets.UTF_8);
> > >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> > >      * </pre>
> > >      *
> > > @@ -243,7 +253,7 @@ public final class MurmurHash2 {
> > >      *
> > >      * <pre>
> > >      * int seed = 0xe17a1465;
> > > -     * byte[] bytes = data.getBytes();
> > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > >      * </pre>
> > >      *
> > > @@ -252,7 +262,7 @@ public final class MurmurHash2 {
> > >      * @see #hash64(byte[], int, int)
> > >      */
> > >     public static long hash64(final String text) {
> > > -        final byte[] bytes = text.getBytes();
> > > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> > >         return hash64(bytes, bytes.length);
> > >     }
> > >
> > > @@ -263,7 +273,7 @@ public final class MurmurHash2 {
> > >      *
> > >      * <pre>
> > >      * int seed = 0xe17a1465;
> > > -     * byte[] bytes = text.substring(from, from + length).getBytes();
> > > +     * byte[] bytes = text.substring(from, from +
> > length).getBytes(StandardCharsets.UTF_8);
> > >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> > >      * </pre>
> > >      *
> > > diff --git
> > a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > index cfeb3d1..4509a8f 100644
> > > --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > > @@ -17,6 +17,9 @@
> > >
> > > package org.apache.commons.codec.digest;
> > >
> > > +import java.nio.charset.Charset;
> > > +import java.nio.charset.StandardCharsets;
> > > +
> > > /**
> > >  * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
> > >  *
> > > @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> > > public final class MurmurHash3 {
> > >
> > >     /**
> > > +     * Default Charset used to convert strings into bytes.
> > > +     *
> > > +     * Consider private; package private for tests only.
> > > +     */
> > > +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> > > +
> > > +    /**
> > >      * A random number to use for a hash code.
> > >      *
> > >      * @deprecated This is not used internally and will be removed in a
> > future release.
> > > @@ -233,7 +243,7 @@ public final class MurmurHash3 {
> > >      * <pre>
> > >      * int offset = 0;
> > >      * int seed = 104729;
> > > -     * byte[] bytes = data.getBytes();
> > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > >      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length,
> seed);
> > >      * </pre>
> > >      *
> > > @@ -249,7 +259,7 @@ public final class MurmurHash3 {
> > >      */
> > >     @Deprecated
> > >     public static int hash32(final String data) {
> > > -        final byte[] bytes = data.getBytes();
> > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > >         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
> > >     }
> > >
> > > @@ -751,7 +761,7 @@ public final class MurmurHash3 {
> > >      * <pre>
> > >      * int offset = 0;
> > >      * int seed = 104729;
> > > -     * byte[] bytes = data.getBytes();
> > > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> > >      * int hash = MurmurHash3.hash128(bytes, offset, bytes.length,
> seed);
> > >      * </pre>
> > >      *
> > > @@ -766,7 +776,7 @@ public final class MurmurHash3 {
> > >      */
> > >     @Deprecated
> > >     public static long[] hash128(final String data) {
> > > -        final byte[] bytes = data.getBytes();
> > > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> > >         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
> > >     }
> > >
> > > diff --git
> > a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > index 4703f9f..61df7f2 100644
> > > ---
> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > +++
> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > > @@ -355,7 +355,7 @@ public class MurmurHash3Test {
> > >                 pos += Character.toChars(codePoint, chars, pos);
> > >             }
> > >             final String text = String.copyValueOf(chars, 0, pos);
> > > -            final byte[] bytes = text.getBytes();
> > > +            final byte[] bytes =
> > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > >             final int h1 = MurmurHash3.hash32(bytes, 0, bytes.length,
> > seed);
> > >             final int h2 = MurmurHash3.hash32(text);
> > >             Assert.assertEquals(h1, h2);
> > > @@ -455,7 +455,7 @@ public class MurmurHash3Test {
> > >      */
> > >     @Test
> > >     public void testHash64() {
> > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > +        final byte[] origin =
> > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > >         final long hash = MurmurHash3.hash64(origin);
> > >         Assert.assertEquals(5785358552565094607L, hash);
> > >     }
> > > @@ -466,7 +466,7 @@ public class MurmurHash3Test {
> > >      */
> > >     @Test
> > >     public void testHash64WithOffsetAndLength() {
> > > -        final byte[] origin = TEST_HASH64.getBytes();
> > > +        final byte[] origin =
> > TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > >         final byte[] originOffset = new byte[origin.length + 150];
> > >         Arrays.fill(originOffset, (byte) 123);
> > >         System.arraycopy(origin, 0, originOffset, 150, origin.length);
> > > @@ -627,7 +627,7 @@ public class MurmurHash3Test {
> > >                 pos += Character.toChars(codePoint, chars, pos);
> > >             }
> > >             final String text = String.copyValueOf(chars, 0, pos);
> > > -            final byte[] bytes = text.getBytes();
> > > +            final byte[] bytes =
> > text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> > >             final long[] h1 = MurmurHash3.hash128(bytes, 0,
> > bytes.length, seed);
> > >             final long[] h2 = MurmurHash3.hash128(text);
> > >             Assert.assertArrayEquals(h1, h2);
> > >
> >
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> > For additional commands, e-mail: dev-help@commons.apache.org
> >
> >
>

Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Gary Gregory <ga...@gmail.com>.
On Sun, Dec 29, 2019 at 5:46 PM Alex Herbert <al...@gmail.com>
wrote:

> OK.
>
> If you are going to make the behaviour change for the string methods then
> the javadoc should be updated. It currently states:
>
> "The string is converted to bytes using the default encoding.”
>
> So perhaps switch this to:
>
> “Before 1.14 the string was converted using default encoding. Since 1.14
> the string is converted to bytes using UTF-8 encoding.”
>
> The MurmurHash3 methods should still be deprecated as they call the
> hash32/hash128 functions that have sign bugs. I deliberately did not add a
> new method accepting a String for the fixed murmur3 algorithm. IMO it is
> unwarranted API bloat.
>
> MurmurHash2 is not deprecated. In that case the String methods are just
> API bloat but the javadoc should be updated.
>
> Alex
>

Thank you Alex, please review Javadoc changes in git master.

Also, if there is anything else that can be improved in code, Javadoc, or
test coverage, please let me know so I can delay creating the release
candidate, which will likely happen tonight (US EST) time permitting.

Gary


>
> > On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> >
> > This is an automated email from the ASF dual-hosted git repository.
> >
> > ggregory pushed a commit to branch master
> > in repository https://gitbox.apache.org/repos/asf/commons-codec.git
> >
> >
> > The following commit(s) were added to refs/heads/master by this push:
> >     new f40005a  [CODEC-276] Reliance on default encoding in MurmurHash2
> and MurmurHash3.
> > f40005a is described below
> >
> > commit f40005ad5a9c892fa8d216b12029a49062224351
> > Author: Gary Gregory <ga...@gmail.com>
> > AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> >
> >    [CODEC-276] Reliance on default encoding in MurmurHash2 and
> MurmurHash3.
> > ---
> > src/changes/changes.xml                            |  1 +
> > .../apache/commons/codec/digest/MurmurHash2.java   | 22
> ++++++++++++++++------
> > .../apache/commons/codec/digest/MurmurHash3.java   | 18
> ++++++++++++++----
> > .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> > 4 files changed, 35 insertions(+), 14 deletions(-)
> >
> > diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> > index 55ba445..9e1c3c4 100644
> > --- a/src/changes/changes.xml
> > +++ b/src/changes/changes.xml
> > @@ -55,6 +55,7 @@ The <action> type attribute can be
> add,update,fix,remove.
> >       <action issue="CODEC-273" dev="ggregory" type="add" due-to="Gary
> Gregory">Add Path APIs to org.apache.commons.codec.digest.DigestUtils
> similar to File APIs.</action>
> >       <action issue="CODEC-274" dev="ggregory" type="add" due-to="Gary
> Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9 and
> up.</action>
> >       <action issue="CODEC-275" dev="ggregory" type="add" due-to="Claude
> Warren">Add missing note in javadoc when sign extension error is present
> #34.</action>
> > +      <action issue="CODEC-276" dev="ggregory" type="add" due-to="Gary
> Gregory">Reliance on default encoding in MurmurHash2 and
> MurmurHash3.</action>
> >     </release>
> >
> >     <release version="1.13" date="2019-07-20" description="Feature and
> fix release.">
> > diff --git
> a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > index 5b0a712..4dfeca9 100644
> > --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> > @@ -17,6 +17,9 @@
> >
> > package org.apache.commons.codec.digest;
> >
> > +import java.nio.charset.Charset;
> > +import java.nio.charset.StandardCharsets;
> > +
> > /**
> >  * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
> >  *
> > @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
> >  */
> > public final class MurmurHash2 {
> >
> > +    /**
> > +     * Default Charset used to convert strings into bytes.
> > +     *
> > +     * Consider private; package private for tests only.
> > +     */
> > +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> > +
> >     // Constants for 32-bit variant
> >     private static final int M32 = 0x5bd1e995;
> >     private static final int R32 = 24;
> > @@ -132,7 +142,7 @@ public final class MurmurHash2 {
> >      *
> >      * <pre>
> >      * int seed = 0x9747b28c;
> > -     * byte[] bytes = data.getBytes();
> > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> >      * </pre>
> >      *
> > @@ -141,7 +151,7 @@ public final class MurmurHash2 {
> >      * @see #hash32(byte[], int, int)
> >      */
> >     public static int hash32(final String text) {
> > -        final byte[] bytes = text.getBytes();
> > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> >         return hash32(bytes, bytes.length);
> >     }
> >
> > @@ -152,7 +162,7 @@ public final class MurmurHash2 {
> >      *
> >      * <pre>
> >      * int seed = 0x9747b28c;
> > -     * byte[] bytes = text.substring(from, from + length).getBytes();
> > +     * byte[] bytes = text.substring(from, from +
> length).getBytes(StandardCharsets.UTF_8);
> >      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
> >      * </pre>
> >      *
> > @@ -243,7 +253,7 @@ public final class MurmurHash2 {
> >      *
> >      * <pre>
> >      * int seed = 0xe17a1465;
> > -     * byte[] bytes = data.getBytes();
> > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> >      * </pre>
> >      *
> > @@ -252,7 +262,7 @@ public final class MurmurHash2 {
> >      * @see #hash64(byte[], int, int)
> >      */
> >     public static long hash64(final String text) {
> > -        final byte[] bytes = text.getBytes();
> > +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
> >         return hash64(bytes, bytes.length);
> >     }
> >
> > @@ -263,7 +273,7 @@ public final class MurmurHash2 {
> >      *
> >      * <pre>
> >      * int seed = 0xe17a1465;
> > -     * byte[] bytes = text.substring(from, from + length).getBytes();
> > +     * byte[] bytes = text.substring(from, from +
> length).getBytes(StandardCharsets.UTF_8);
> >      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
> >      * </pre>
> >      *
> > diff --git
> a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > index cfeb3d1..4509a8f 100644
> > --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> > @@ -17,6 +17,9 @@
> >
> > package org.apache.commons.codec.digest;
> >
> > +import java.nio.charset.Charset;
> > +import java.nio.charset.StandardCharsets;
> > +
> > /**
> >  * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
> >  *
> > @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> > public final class MurmurHash3 {
> >
> >     /**
> > +     * Default Charset used to convert strings into bytes.
> > +     *
> > +     * Consider private; package private for tests only.
> > +     */
> > +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> > +
> > +    /**
> >      * A random number to use for a hash code.
> >      *
> >      * @deprecated This is not used internally and will be removed in a
> future release.
> > @@ -233,7 +243,7 @@ public final class MurmurHash3 {
> >      * <pre>
> >      * int offset = 0;
> >      * int seed = 104729;
> > -     * byte[] bytes = data.getBytes();
> > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
> >      * </pre>
> >      *
> > @@ -249,7 +259,7 @@ public final class MurmurHash3 {
> >      */
> >     @Deprecated
> >     public static int hash32(final String data) {
> > -        final byte[] bytes = data.getBytes();
> > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> >         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
> >     }
> >
> > @@ -751,7 +761,7 @@ public final class MurmurHash3 {
> >      * <pre>
> >      * int offset = 0;
> >      * int seed = 104729;
> > -     * byte[] bytes = data.getBytes();
> > +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
> >      * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
> >      * </pre>
> >      *
> > @@ -766,7 +776,7 @@ public final class MurmurHash3 {
> >      */
> >     @Deprecated
> >     public static long[] hash128(final String data) {
> > -        final byte[] bytes = data.getBytes();
> > +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
> >         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
> >     }
> >
> > diff --git
> a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > index 4703f9f..61df7f2 100644
> > --- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > +++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> > @@ -355,7 +355,7 @@ public class MurmurHash3Test {
> >                 pos += Character.toChars(codePoint, chars, pos);
> >             }
> >             final String text = String.copyValueOf(chars, 0, pos);
> > -            final byte[] bytes = text.getBytes();
> > +            final byte[] bytes =
> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >             final int h1 = MurmurHash3.hash32(bytes, 0, bytes.length,
> seed);
> >             final int h2 = MurmurHash3.hash32(text);
> >             Assert.assertEquals(h1, h2);
> > @@ -455,7 +455,7 @@ public class MurmurHash3Test {
> >      */
> >     @Test
> >     public void testHash64() {
> > -        final byte[] origin = TEST_HASH64.getBytes();
> > +        final byte[] origin =
> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >         final long hash = MurmurHash3.hash64(origin);
> >         Assert.assertEquals(5785358552565094607L, hash);
> >     }
> > @@ -466,7 +466,7 @@ public class MurmurHash3Test {
> >      */
> >     @Test
> >     public void testHash64WithOffsetAndLength() {
> > -        final byte[] origin = TEST_HASH64.getBytes();
> > +        final byte[] origin =
> TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >         final byte[] originOffset = new byte[origin.length + 150];
> >         Arrays.fill(originOffset, (byte) 123);
> >         System.arraycopy(origin, 0, originOffset, 150, origin.length);
> > @@ -627,7 +627,7 @@ public class MurmurHash3Test {
> >                 pos += Character.toChars(codePoint, chars, pos);
> >             }
> >             final String text = String.copyValueOf(chars, 0, pos);
> > -            final byte[] bytes = text.getBytes();
> > +            final byte[] bytes =
> text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
> >             final long[] h1 = MurmurHash3.hash128(bytes, 0,
> bytes.length, seed);
> >             final long[] h2 = MurmurHash3.hash128(text);
> >             Assert.assertArrayEquals(h1, h2);
> >
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> For additional commands, e-mail: dev-help@commons.apache.org
>
>

Re: [commons-codec] branch master updated: [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.

Posted by Alex Herbert <al...@gmail.com>.
OK.

If you are going to make the behaviour change for the string methods then the javadoc should be updated. It currently states:

"The string is converted to bytes using the default encoding.”

So perhaps switch this to:

“Before 1.14 the string was converted using default encoding. Since 1.14 the string is converted to bytes using UTF-8 encoding.”

The MurmurHash3 methods should still be deprecated as they call the hash32/hash128 functions that have sign bugs. I deliberately did not add a new method accepting a String for the fixed murmur3 algorithm. IMO it is unwarranted API bloat.

MurmurHash2 is not deprecated. In that case the String methods are just API bloat but the javadoc should be updated.

Alex


> On 29 Dec 2019, at 21:33, ggregory@apache.org wrote:
> 
> This is an automated email from the ASF dual-hosted git repository.
> 
> ggregory pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/commons-codec.git
> 
> 
> The following commit(s) were added to refs/heads/master by this push:
>     new f40005a  [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.
> f40005a is described below
> 
> commit f40005ad5a9c892fa8d216b12029a49062224351
> Author: Gary Gregory <ga...@gmail.com>
> AuthorDate: Sun Dec 29 16:33:14 2019 -0500
> 
>    [CODEC-276] Reliance on default encoding in MurmurHash2 and MurmurHash3.
> ---
> src/changes/changes.xml                            |  1 +
> .../apache/commons/codec/digest/MurmurHash2.java   | 22 ++++++++++++++++------
> .../apache/commons/codec/digest/MurmurHash3.java   | 18 ++++++++++++++----
> .../commons/codec/digest/MurmurHash3Test.java      |  8 ++++----
> 4 files changed, 35 insertions(+), 14 deletions(-)
> 
> diff --git a/src/changes/changes.xml b/src/changes/changes.xml
> index 55ba445..9e1c3c4 100644
> --- a/src/changes/changes.xml
> +++ b/src/changes/changes.xml
> @@ -55,6 +55,7 @@ The <action> type attribute can be add,update,fix,remove.
>       <action issue="CODEC-273" dev="ggregory" type="add" due-to="Gary Gregory">Add Path APIs to org.apache.commons.codec.digest.DigestUtils similar to File APIs.</action>
>       <action issue="CODEC-274" dev="ggregory" type="add" due-to="Gary Gregory">Add SHA-512/224 and SHA-512/256 to DigestUtils for Java 9 and up.</action>
>       <action issue="CODEC-275" dev="ggregory" type="add" due-to="Claude Warren">Add missing note in javadoc when sign extension error is present #34.</action>
> +      <action issue="CODEC-276" dev="ggregory" type="add" due-to="Gary Gregory">Reliance on default encoding in MurmurHash2 and MurmurHash3.</action>
>     </release>
> 
>     <release version="1.13" date="2019-07-20" description="Feature and fix release.">
> diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> index 5b0a712..4dfeca9 100644
> --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
> @@ -17,6 +17,9 @@
> 
> package org.apache.commons.codec.digest;
> 
> +import java.nio.charset.Charset;
> +import java.nio.charset.StandardCharsets;
> +
> /**
>  * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
>  *
> @@ -48,6 +51,13 @@ package org.apache.commons.codec.digest;
>  */
> public final class MurmurHash2 {
> 
> +    /**
> +     * Default Charset used to convert strings into bytes.
> +     * 
> +     * Consider private; package private for tests only.
> +     */
> +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> +
>     // Constants for 32-bit variant
>     private static final int M32 = 0x5bd1e995;
>     private static final int R32 = 24;
> @@ -132,7 +142,7 @@ public final class MurmurHash2 {
>      *
>      * <pre>
>      * int seed = 0x9747b28c;
> -     * byte[] bytes = data.getBytes();
> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
>      * </pre>
>      *
> @@ -141,7 +151,7 @@ public final class MurmurHash2 {
>      * @see #hash32(byte[], int, int)
>      */
>     public static int hash32(final String text) {
> -        final byte[] bytes = text.getBytes();
> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
>         return hash32(bytes, bytes.length);
>     }
> 
> @@ -152,7 +162,7 @@ public final class MurmurHash2 {
>      *
>      * <pre>
>      * int seed = 0x9747b28c;
> -     * byte[] bytes = text.substring(from, from + length).getBytes();
> +     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
>      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
>      * </pre>
>      *
> @@ -243,7 +253,7 @@ public final class MurmurHash2 {
>      *
>      * <pre>
>      * int seed = 0xe17a1465;
> -     * byte[] bytes = data.getBytes();
> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
>      * </pre>
>      *
> @@ -252,7 +262,7 @@ public final class MurmurHash2 {
>      * @see #hash64(byte[], int, int)
>      */
>     public static long hash64(final String text) {
> -        final byte[] bytes = text.getBytes();
> +        final byte[] bytes = text.getBytes(GET_BYTES_CHARSET);
>         return hash64(bytes, bytes.length);
>     }
> 
> @@ -263,7 +273,7 @@ public final class MurmurHash2 {
>      *
>      * <pre>
>      * int seed = 0xe17a1465;
> -     * byte[] bytes = text.substring(from, from + length).getBytes();
> +     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
>      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
>      * </pre>
>      *
> diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> index cfeb3d1..4509a8f 100644
> --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
> @@ -17,6 +17,9 @@
> 
> package org.apache.commons.codec.digest;
> 
> +import java.nio.charset.Charset;
> +import java.nio.charset.StandardCharsets;
> +
> /**
>  * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
>  *
> @@ -56,6 +59,13 @@ package org.apache.commons.codec.digest;
> public final class MurmurHash3 {
> 
>     /**
> +     * Default Charset used to convert strings into bytes.
> +     * 
> +     * Consider private; package private for tests only.
> +     */
> +    static final Charset GET_BYTES_CHARSET = StandardCharsets.UTF_8;
> +
> +    /**
>      * A random number to use for a hash code.
>      *
>      * @deprecated This is not used internally and will be removed in a future release.
> @@ -233,7 +243,7 @@ public final class MurmurHash3 {
>      * <pre>
>      * int offset = 0;
>      * int seed = 104729;
> -     * byte[] bytes = data.getBytes();
> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>      * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
>      * </pre>
>      *
> @@ -249,7 +259,7 @@ public final class MurmurHash3 {
>      */
>     @Deprecated
>     public static int hash32(final String data) {
> -        final byte[] bytes = data.getBytes();
> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
>         return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
>     }
> 
> @@ -751,7 +761,7 @@ public final class MurmurHash3 {
>      * <pre>
>      * int offset = 0;
>      * int seed = 104729;
> -     * byte[] bytes = data.getBytes();
> +     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
>      * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
>      * </pre>
>      *
> @@ -766,7 +776,7 @@ public final class MurmurHash3 {
>      */
>     @Deprecated
>     public static long[] hash128(final String data) {
> -        final byte[] bytes = data.getBytes();
> +        final byte[] bytes = data.getBytes(GET_BYTES_CHARSET);
>         return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
>     }
> 
> diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> index 4703f9f..61df7f2 100644
> --- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> +++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
> @@ -355,7 +355,7 @@ public class MurmurHash3Test {
>                 pos += Character.toChars(codePoint, chars, pos);
>             }
>             final String text = String.copyValueOf(chars, 0, pos);
> -            final byte[] bytes = text.getBytes();
> +            final byte[] bytes = text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>             final int h1 = MurmurHash3.hash32(bytes, 0, bytes.length, seed);
>             final int h2 = MurmurHash3.hash32(text);
>             Assert.assertEquals(h1, h2);
> @@ -455,7 +455,7 @@ public class MurmurHash3Test {
>      */
>     @Test
>     public void testHash64() {
> -        final byte[] origin = TEST_HASH64.getBytes();
> +        final byte[] origin = TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>         final long hash = MurmurHash3.hash64(origin);
>         Assert.assertEquals(5785358552565094607L, hash);
>     }
> @@ -466,7 +466,7 @@ public class MurmurHash3Test {
>      */
>     @Test
>     public void testHash64WithOffsetAndLength() {
> -        final byte[] origin = TEST_HASH64.getBytes();
> +        final byte[] origin = TEST_HASH64.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>         final byte[] originOffset = new byte[origin.length + 150];
>         Arrays.fill(originOffset, (byte) 123);
>         System.arraycopy(origin, 0, originOffset, 150, origin.length);
> @@ -627,7 +627,7 @@ public class MurmurHash3Test {
>                 pos += Character.toChars(codePoint, chars, pos);
>             }
>             final String text = String.copyValueOf(chars, 0, pos);
> -            final byte[] bytes = text.getBytes();
> +            final byte[] bytes = text.getBytes(MurmurHash3.GET_BYTES_CHARSET);
>             final long[] h1 = MurmurHash3.hash128(bytes, 0, bytes.length, seed);
>             final long[] h2 = MurmurHash3.hash128(text);
>             Assert.assertArrayEquals(h1, h2);
> 


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
For additional commands, e-mail: dev-help@commons.apache.org