You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2012/08/01 00:12:33 UTC

svn commit: r1367802 - in /lucene/dev/branches/branch_4x: ./ dev-tools/ lucene/ lucene/analysis/ lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/ lucene/benchmark/ lucene/core/ lucene/demo/ lucene/facet/ lucene/grouping/ luce...

Author: yonik
Date: Tue Jul 31 22:12:31 2012
New Revision: 1367802

URL: http://svn.apache.org/viewvc?rev=1367802&view=rev
Log:
SOLR-3154: add murmurhash3 that can work directly on a string

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/BUILD.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/MIGRATE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/README.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/   (props changed)
    lucene/dev/branches/branch_4x/lucene/build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/demo/   (props changed)
    lucene/dev/branches/branch_4x/lucene/facet/   (props changed)
    lucene/dev/branches/branch_4x/lucene/grouping/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/ivy-settings.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/join/   (props changed)
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)
    lucene/dev/branches/branch_4x/lucene/memory/   (props changed)
    lucene/dev/branches/branch_4x/lucene/misc/   (props changed)
    lucene/dev/branches/branch_4x/lucene/module-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/queries/   (props changed)
    lucene/dev/branches/branch_4x/lucene/queryparser/   (props changed)
    lucene/dev/branches/branch_4x/lucene/sandbox/   (props changed)
    lucene/dev/branches/branch_4x/lucene/site/   (props changed)
    lucene/dev/branches/branch_4x/lucene/spatial/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/tools/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/README.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/cloud-dev/   (props changed)
    lucene/dev/branches/branch_4x/solr/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/solr/example/   (props changed)
    lucene/dev/branches/branch_4x/solr/lib/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/scripts/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java
    lucene/dev/branches/branch_4x/solr/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/solr/testlogging.properties   (props changed)
    lucene/dev/branches/branch_4x/solr/webapp/   (props changed)

Modified: lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java?rev=1367802&r1=1367801&r2=1367802&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java (original)
+++ lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java Tue Jul 31 22:12:31 2012
@@ -291,4 +291,132 @@ public class Hash {
     return h1;
   }
 
+
+
+  /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding
+   * the string to a temporary buffer.  This is more than 2x faster than hashing the result
+   * of String.getBytes().
+   */
+  public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {
+
+    final int c1 = 0xcc9e2d51;
+    final int c2 = 0x1b873593;
+
+    int h1 = seed;
+
+    int pos = offset;
+    int end = offset + len;
+    int k1 = 0;
+    int k2 = 0;
+    int shift = 0;
+    int bits = 0;
+    int nBytes = 0;   // length in UTF8 bytes
+
+
+    while (pos < end) {
+      int code = data.charAt(pos++);
+      if (code < 0x80) {
+        k2 = code;
+        bits = 8;
+
+        /***
+         // optimized ascii implementation (currently slower!!! code size?)
+         if (shift == 24) {
+         k1 = k1 | (code << 24);
+
+         k1 *= c1;
+         k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
+         k1 *= c2;
+
+         h1 ^= k1;
+         h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
+         h1 = h1*5+0xe6546b64;
+
+         shift = 0;
+         nBytes += 4;
+         k1 = 0;
+         } else {
+         k1 |= code << shift;
+         shift += 8;
+         }
+         continue;
+         ***/
+
+      }
+      else if (code < 0x800) {
+        k2 = (0xC0 | (code >> 6))
+            | ((0x80 | (code & 0x3F)) << 8);
+        bits = 16;
+      }
+      else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
+        // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
+        k2 = (0xE0 | (code >> 12))
+            | ((0x80 | ((code >> 6) & 0x3F)) << 8)
+            | ((0x80 | (code & 0x3F)) << 16);
+        bits = 24;
+      } else {
+        // surrogate pair
+        // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
+        int utf32 = (int) data.charAt(pos++);
+        utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+        k2 = (0xff & (0xF0 | (utf32 >> 18)))
+            | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
+            | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
+            |  (0x80 | (utf32 & 0x3F)) << 24;
+        bits = 32;
+      }
+
+
+      k1 |= k2 << shift;
+
+      // int used_bits = 32 - shift;  // how many bits of k2 were used in k1.
+      // int unused_bits = bits - used_bits; //  (bits-(32-shift)) == bits+shift-32  == bits-newshift
+
+      shift += bits;
+      if (shift >= 32) {
+        // mix after we have a complete word
+
+        k1 *= c1;
+        k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
+        k1 *= c2;
+
+        h1 ^= k1;
+        h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
+        h1 = h1*5+0xe6546b64;
+
+        shift -= 32;
+        // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
+        if (shift != 0) {
+          k1 = k2 >>> (bits-shift);   // bits used == bits - newshift
+        } else {
+          k1 = 0;
+        }
+        nBytes += 4;
+      }
+
+    } // inner
+
+    // handle tail
+    if (shift > 0) {
+      nBytes += shift >> 3;
+      k1 *= c1;
+      k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
+      k1 *= c2;
+      h1 ^= k1;
+    }
+
+    // finalization
+    h1 ^= nBytes;
+
+    // fmix(h1);
+    h1 ^= h1 >>> 16;
+    h1 *= 0x85ebca6b;
+    h1 ^= h1 >>> 13;
+    h1 *= 0xc2b2ae35;
+    h1 ^= h1 >>> 16;
+
+    return h1;
+  }
+
+
 }