You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/11/21 17:00:46 UTC

[commons-codec] branch master updated (91cdc16 -> a1fb848)

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git.


    from 91cdc16  Close javadoc <pre> and </p> tags.
     new 48b2331  Fix MurmurHash2 code in-line with MurmurHash3.
     new a1fb848  MurmurHashTest2: Use Assert.equals and conditional Assert.fail.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/commons/codec/digest/MurmurHash2.java   | 174 ++++++++++++++++-----
 .../apache/commons/codec/digest/MurmurHash3.java   |   3 +-
 .../commons/codec/digest/MurmurHash2Test.java      |  33 ++--
 3 files changed, 154 insertions(+), 56 deletions(-)


[commons-codec] 02/02: MurmurHashTest2: Use Assert.equals and conditional Assert.fail.

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git

commit a1fb8486705ff6cea89b0670036c0574d407c339
Author: aherbert <ah...@apache.org>
AuthorDate: Thu Nov 21 17:00:42 2019 +0000

    MurmurHashTest2: Use Assert.equals and conditional Assert.fail.
---
 .../commons/codec/digest/MurmurHash2Test.java      | 33 +++++++++++-----------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash2Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash2Test.java
index a121909..9aee349 100644
--- a/src/test/java/org/apache/commons/codec/digest/MurmurHash2Test.java
+++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash2Test.java
@@ -17,9 +17,7 @@
 
 package org.apache.commons.codec.digest;
 
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
+import org.junit.Assert;
 import org.junit.Test;
 
 public class MurmurHash2Test {
@@ -85,8 +83,8 @@ public class MurmurHash2Test {
         for (int i = 0; i < input.length; i++) {
             final int hash = MurmurHash2.hash32(input[i], input[i].length, 0x71b4954d);
             if (hash != results32_seed[i]) {
-                fail(String.format("Unexpected hash32 result for example %d: 0x%08x instead of 0x%08x", i, hash,
-                        results32_seed[i]));
+                Assert.fail(String.format("Unexpected hash32 result for example %d: 0x%08x instead of 0x%08x", i, hash,
+                            results32_seed[i]));
             }
         }
     }
@@ -96,8 +94,8 @@ public class MurmurHash2Test {
         for (int i = 0; i < input.length; i++) {
             final int hash = MurmurHash2.hash32(input[i], input[i].length);
             if (hash != results32_standard[i]) {
-                fail(String.format("Unexpected hash32 result for example %d: 0x%08x instead of 0x%08x", i, hash,
-                        results32_standard[i]));
+                Assert.fail(String.format("Unexpected hash32 result for example %d: 0x%08x instead of 0x%08x", i, hash,
+                            results32_standard[i]));
             }
         }
     }
@@ -105,21 +103,23 @@ public class MurmurHash2Test {
     @Test
     public void testHash32String() {
         final int hash = MurmurHash2.hash32(text);
-        assertTrue(hash == 0xb3bf597e);
+        Assert.assertEquals(0xb3bf597e, hash);
     }
 
     @Test
     public void testHash32StringIntInt() {
         final int hash = MurmurHash2.hash32(text, 2, text.length() - 4);
-        assertTrue(hash == 0x4d666d90);
+        Assert.assertEquals(0x4d666d90, hash);
     }
 
     @Test
     public void testHash64ByteArrayIntInt() {
         for (int i = 0; i < input.length; i++) {
             final long hash = MurmurHash2.hash64(input[i], input[i].length, 0x344d1f5c);
-            assertTrue(String.format("Unexpected hash64 result for example %d: 0x%016x instead of 0x%016x", i, hash,
-                    results64_seed[i]), hash == results64_seed[i]);
+            if (hash != results64_seed[i]) {
+                Assert.fail(String.format("Unexpected hash64 result for example %d: 0x%016x instead of 0x%016x", i, hash,
+                            results64_seed[i]));
+            }
         }
     }
 
@@ -127,21 +127,22 @@ public class MurmurHash2Test {
     public void testHash64ByteArrayInt() {
         for (int i = 0; i < input.length; i++) {
             final long hash = MurmurHash2.hash64(input[i], input[i].length);
-            assertTrue(String.format("Unexpected hash64 result for example %d: 0x%016x instead of 0x%016x", i, hash,
-                    results64_standard[i]), hash == results64_standard[i]);
+            if (hash != results64_standard[i]) {
+                Assert.fail(String.format("Unexpected hash64 result for example %d: 0x%016x instead of 0x%016x", i, hash,
+                    results64_standard[i]));
+            }
         }
     }
 
     @Test
     public void testHash64String() {
         final long hash = MurmurHash2.hash64(text);
-        assertTrue(hash == 0x0920e0c1b7eeb261l);
+        Assert.assertEquals(0x0920e0c1b7eeb261L, hash);
     }
 
     @Test
     public void testHash64StringIntInt() {
         final long hash = MurmurHash2.hash64(text, 2, text.length() - 4);
-        assertTrue(hash == 0xa8b33145194985a2l);
+        Assert.assertEquals(0xa8b33145194985a2L, hash);
     }
-
 }


[commons-codec] 01/02: Fix MurmurHash2 code in-line with MurmurHash3.

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git

commit 48b23313e7c7428fc421942352c8dba4d1e35200
Author: aherbert <ah...@apache.org>
AuthorDate: Thu Nov 21 16:54:34 2019 +0000

    Fix MurmurHash2 code in-line with MurmurHash3.
    
    This updates the documentation and code so the two can be compared.
    
    No binary functionality is changed.
---
 .../apache/commons/codec/digest/MurmurHash2.java   | 174 ++++++++++++++++-----
 .../apache/commons/codec/digest/MurmurHash3.java   |   3 +-
 2 files changed, 137 insertions(+), 40 deletions(-)

diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
index 0e07bfe..e349ff9 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
@@ -18,30 +18,42 @@
 package org.apache.commons.codec.digest;
 
 /**
- * MurmurHash2 yields a 32-bit or 64-bit value.
+ * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
  *
- * MurmurHash is a non-cryptographic hash function suitable for general
+ * <p>MurmurHash is a non-cryptographic hash function suitable for general
  * hash-based lookup. The name comes from two basic operations, multiply (MU)
  * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
  * it is not specifically designed to be difficult to reverse by an adversary,
- * making it unsuitable for cryptographic purposes.
+ * making it unsuitable for cryptographic purposes.</p>
  *
- * This is a re-implementation of the original C code plus some additional
- * features.
+ * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
+ * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's
+ * original {@code c++} code in SMHasher.</p>
  *
- * Public domain.
+ * <p>This is a re-implementation of the original C code plus some additional
+ * features.</p>
+ *
+ * <p>This is public domain code with no copyrights. From homepage of
+ * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
+ *
+ * <blockquote>
+ * "All MurmurHash versions are public domain software, and the author
+ * disclaims all copyright to their code."
+ * </blockquote>
  *
  * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
+ * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
+ *   Original MurmurHash2 c++ code</a>
  * @since 1.13
  */
 public final class MurmurHash2 {
 
-    // all methods static; private constructor.
+    /** No instance methods. */
     private MurmurHash2() {
     }
 
     /**
-     * Generates 32 bit hash from byte array of the given length and seed.
+     * Generates 32 bit hash from byte array with the given length and seed.
      *
      * @param data   byte array to hash
      * @param length length of the array to hash
@@ -56,12 +68,14 @@ public final class MurmurHash2 {
 
         // Initialize the hash to a random value
         int h = seed ^ length;
-        final int length4 = length / 4;
 
-        for (int i = 0; i < length4; i++) {
-            final int i4 = i * 4;
-            int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + ((data[i4 + 2] & 0xff) << 16)
-                    + ((data[i4 + 3] & 0xff) << 24);
+        // Mix 4 bytes at a time into the hash
+        final int nblocks = length >> 2;
+
+        // body
+        for (int i = 0; i < nblocks; i++) {
+            final int index = (i << 2);
+            int k = getLittleEndianInt(data, index);
             k *= m;
             k ^= k >>> r;
             k *= m;
@@ -70,16 +84,19 @@ public final class MurmurHash2 {
         }
 
         // Handle the last few bytes of the input array
-        switch (length % 4) {
+        final int index = (nblocks << 2);
+        switch (length - index) {
         case 3:
-            h ^= (data[(length & ~3) + 2] & 0xff) << 16;
+            h ^= (data[index + 2] & 0xff) << 16;
         case 2:
-            h ^= (data[(length & ~3) + 1] & 0xff) << 8;
+            h ^= (data[index + 1] & 0xff) << 8;
         case 1:
-            h ^= (data[length & ~3] & 0xff);
+            h ^= (data[index] & 0xff);
             h *= m;
         }
 
+        // Do a few final mixes of the hash to ensure the last few
+        // bytes are well-incorporated.
         h ^= h >>> 13;
         h *= m;
         h ^= h >>> 15;
@@ -88,21 +105,37 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 32 bit hash from byte array with default seed value.
+     * Generates 32 bit hash from byte array with the given length and a default seed value.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0x9747b28c;
+     * int hash = MurmurHash2.hash32(data, offset, length, seed);
+     * </pre>
      *
      * @param data   byte array to hash
      * @param length length of the array to hash
      * @return 32 bit hash of the given array
+     * @see #hash32(byte[], int, int)
      */
     public static int hash32(final byte[] data, final int length) {
         return hash32(data, length, 0x9747b28c);
     }
 
     /**
-     * Generates 32 bit hash from a string.
+     * Generates 32 bit hash from a string with a default seed.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0x9747b28c;
+     * byte[] bytes = data.getBytes();
+     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text string to hash
      * @return 32 bit hash of the given string
+     * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text) {
         final byte[] bytes = text.getBytes();
@@ -110,12 +143,21 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 32 bit hash from a substring.
+     * Generates 32 bit hash from a substring with a default seed value.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0x9747b28c;
+     * byte[] bytes = text.substring(from, from + length).getBytes();
+     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text   string to hash
      * @param from   starting index
      * @param length length of the substring to hash
      * @return 32 bit hash of the given string
+     * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text, final int from, final int length) {
         return hash32(text.substring(from, from + length));
@@ -135,14 +177,12 @@ public final class MurmurHash2 {
 
         long h = (seed & 0xffffffffl) ^ (length * m);
 
-        final int length8 = length / 8;
+        final int nblocks = length >> 3;
 
-        for (int i = 0; i < length8; i++) {
-            final int i8 = i * 8;
-            long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8)
-                    + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24)
-                    + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40)
-                    + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56);
+        // body
+        for (int i = 0; i < nblocks; i++) {
+            final int index = (i << 3);
+            long k = getLittleEndianLong(data, index);
 
             k *= m;
             k ^= k >>> r;
@@ -152,21 +192,22 @@ public final class MurmurHash2 {
             h *= m;
         }
 
-        switch (length % 8) {
+        final int index = (nblocks << 3);
+        switch (length - index) {
         case 7:
-            h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48;
+            h ^= ((long) data[index + 6] & 0xff) << 48;
         case 6:
-            h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40;
+            h ^= ((long) data[index + 5] & 0xff) << 40;
         case 5:
-            h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32;
+            h ^= ((long) data[index + 4] & 0xff) << 32;
         case 4:
-            h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24;
+            h ^= ((long) data[index + 3] & 0xff) << 24;
         case 3:
-            h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16;
+            h ^= ((long) data[index + 2] & 0xff) << 16;
         case 2:
-            h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8;
+            h ^= ((long) data[index + 1] & 0xff) << 8;
         case 1:
-            h ^= data[length & ~7] & 0xff;
+            h ^= ((long) data[index] & 0xff);
             h *= m;
         }
 
@@ -178,21 +219,37 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 64 bit hash from byte array with default seed value.
+     * Generates 64 bit hash from byte array with given length and a default seed value.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0xe17a1465;
+     * int hash = MurmurHash2.hash64(data, offset, length, seed);
+     * </pre>
      *
      * @param data   byte array to hash
      * @param length length of the array to hash
      * @return 64 bit hash of the given string
+     * @see #hash64(byte[], int, int)
      */
     public static long hash64(final byte[] data, final int length) {
         return hash64(data, length, 0xe17a1465);
     }
 
     /**
-     * Generates 64 bit hash from a string.
+     * Generates 64 bit hash from a string with a default seed.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0xe17a1465;
+     * byte[] bytes = data.getBytes();
+     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text string to hash
      * @return 64 bit hash of the given string
+     * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text) {
         final byte[] bytes = text.getBytes();
@@ -200,14 +257,55 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 64 bit hash from a substring.
+     * Generates 64 bit hash from a substring with a default seed value.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0xe17a1465;
+     * byte[] bytes = text.substring(from, from + length).getBytes();
+     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text   string to hash
      * @param from   starting index
      * @param length length of the substring to hash
      * @return 64 bit hash of the given array
+     * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text, final int from, final int length) {
         return hash64(text.substring(from, from + length));
     }
+
+    /**
+     * Gets the little-endian int from 4 bytes starting at the specified index.
+     *
+     * @param data The data
+     * @param index The index
+     * @return The little-endian int
+     */
+    private static int getLittleEndianInt(final byte[] data, final int index) {
+        return ((data[index    ] & 0xff)      ) |
+               ((data[index + 1] & 0xff) <<  8) |
+               ((data[index + 2] & 0xff) << 16) |
+               ((data[index + 3] & 0xff) << 24);
+    }
+
+    /**
+     * Gets the little-endian long from 8 bytes starting at the specified index.
+     *
+     * @param data The data
+     * @param index The index
+     * @return The little-endian long
+     */
+    private static long getLittleEndianLong(final byte[] data, final int index) {
+        return (((long) data[index    ] & 0xff)      ) |
+               (((long) data[index + 1] & 0xff) <<  8) |
+               (((long) data[index + 2] & 0xff) << 16) |
+               (((long) data[index + 3] & 0xff) << 24) |
+               (((long) data[index + 4] & 0xff) << 32) |
+               (((long) data[index + 5] & 0xff) << 40) |
+               (((long) data[index + 6] & 0xff) << 48) |
+               (((long) data[index + 7] & 0xff) << 56);
+    }
 }
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index 1fda9ac..513ace8 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -18,7 +18,7 @@
 package org.apache.commons.codec.digest;
 
 /**
- * MurmurHash3 yields a 32-bit or 128-bit value.
+ * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
  *
  * <p>MurmurHash is a non-cryptographic hash function suitable for general
  * hash-based lookup. The name comes from two basic operations, multiply (MU)
@@ -860,7 +860,6 @@ public final class MurmurHash3 {
         return new long[] { h1, h2 };
     }
 
-
     /**
      * Gets the little-endian long from 8 bytes starting at the specified index.
      *