You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2012/08/01 00:12:33 UTC
svn commit: r1367802 - in /lucene/dev/branches/branch_4x: ./ dev-tools/
lucene/ lucene/analysis/
lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/
lucene/benchmark/ lucene/core/ lucene/demo/ lucene/facet/ lucene/grouping/
luce...
Author: yonik
Date: Tue Jul 31 22:12:31 2012
New Revision: 1367802
URL: http://svn.apache.org/viewvc?rev=1367802&view=rev
Log:
SOLR-3154: add murmurhash3 that can work directly on a string
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/BUILD.txt (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (props changed)
lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt (props changed)
lucene/dev/branches/branch_4x/lucene/LICENSE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/MIGRATE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/README.txt (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java (props changed)
lucene/dev/branches/branch_4x/lucene/backwards/ (props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_4x/lucene/build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/common-build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/demo/ (props changed)
lucene/dev/branches/branch_4x/lucene/facet/ (props changed)
lucene/dev/branches/branch_4x/lucene/grouping/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_4x/lucene/ivy-settings.xml (props changed)
lucene/dev/branches/branch_4x/lucene/join/ (props changed)
lucene/dev/branches/branch_4x/lucene/licenses/ (props changed)
lucene/dev/branches/branch_4x/lucene/memory/ (props changed)
lucene/dev/branches/branch_4x/lucene/misc/ (props changed)
lucene/dev/branches/branch_4x/lucene/module-build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/queries/ (props changed)
lucene/dev/branches/branch_4x/lucene/queryparser/ (props changed)
lucene/dev/branches/branch_4x/lucene/sandbox/ (props changed)
lucene/dev/branches/branch_4x/lucene/site/ (props changed)
lucene/dev/branches/branch_4x/lucene/spatial/ (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/ (props changed)
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/lucene/tools/ (props changed)
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/CHANGES.txt (props changed)
lucene/dev/branches/branch_4x/solr/LICENSE.txt (props changed)
lucene/dev/branches/branch_4x/solr/NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/README.txt (props changed)
lucene/dev/branches/branch_4x/solr/build.xml (props changed)
lucene/dev/branches/branch_4x/solr/cloud-dev/ (props changed)
lucene/dev/branches/branch_4x/solr/common-build.xml (props changed)
lucene/dev/branches/branch_4x/solr/contrib/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/dev-tools/ (props changed)
lucene/dev/branches/branch_4x/solr/example/ (props changed)
lucene/dev/branches/branch_4x/solr/lib/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/scripts/ (props changed)
lucene/dev/branches/branch_4x/solr/solrj/ (props changed)
lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java
lucene/dev/branches/branch_4x/solr/test-framework/ (props changed)
lucene/dev/branches/branch_4x/solr/testlogging.properties (props changed)
lucene/dev/branches/branch_4x/solr/webapp/ (props changed)
Modified: lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java?rev=1367802&r1=1367801&r2=1367802&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java (original)
+++ lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/util/Hash.java Tue Jul 31 22:12:31 2012
@@ -291,4 +291,132 @@ public class Hash {
return h1;
}
+
+
+ /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding
+ * the string to a temporary buffer. This is more than 2x faster than hashing the result
+ * of String.getBytes().
+ */
+ public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {
+
+ final int c1 = 0xcc9e2d51;
+ final int c2 = 0x1b873593;
+
+ int h1 = seed;
+
+ int pos = offset;
+ int end = offset + len;
+ int k1 = 0;
+ int k2 = 0;
+ int shift = 0;
+ int bits = 0;
+ int nBytes = 0; // length in UTF8 bytes
+
+
+ while (pos < end) {
+ int code = data.charAt(pos++);
+ if (code < 0x80) {
+ k2 = code;
+ bits = 8;
+
+ /***
+ // optimized ascii implementation (currently slower!!! code size?)
+ if (shift == 24) {
+ k1 = k1 | (code << 24);
+
+ k1 *= c1;
+ k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
+ h1 = h1*5+0xe6546b64;
+
+ shift = 0;
+ nBytes += 4;
+ k1 = 0;
+ } else {
+ k1 |= code << shift;
+ shift += 8;
+ }
+ continue;
+ ***/
+
+ }
+ else if (code < 0x800) {
+ k2 = (0xC0 | (code >> 6))
+ | ((0x80 | (code & 0x3F)) << 8);
+ bits = 16;
+ }
+ else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
+ // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
+ k2 = (0xE0 | (code >> 12))
+ | ((0x80 | ((code >> 6) & 0x3F)) << 8)
+ | ((0x80 | (code & 0x3F)) << 16);
+ bits = 24;
+ } else {
+ // surrogate pair
+ // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
+ int utf32 = (int) data.charAt(pos++);
+ utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+ k2 = (0xff & (0xF0 | (utf32 >> 18)))
+ | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
+ | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
+ | (0x80 | (utf32 & 0x3F)) << 24;
+ bits = 32;
+ }
+
+
+ k1 |= k2 << shift;
+
+ // int used_bits = 32 - shift; // how many bits of k2 were used in k1.
+ // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift
+
+ shift += bits;
+ if (shift >= 32) {
+ // mix after we have a complete word
+
+ k1 *= c1;
+ k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
+ h1 = h1*5+0xe6546b64;
+
+ shift -= 32;
+ // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
+ if (shift != 0) {
+ k1 = k2 >>> (bits-shift); // bits used == bits - newshift
+ } else {
+ k1 = 0;
+ }
+ nBytes += 4;
+ }
+
+ } // inner
+
+ // handle tail
+ if (shift > 0) {
+ nBytes += shift >> 3;
+ k1 *= c1;
+ k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ // finalization
+ h1 ^= nBytes;
+
+ // fmix(h1);
+ h1 ^= h1 >>> 16;
+ h1 *= 0x85ebca6b;
+ h1 ^= h1 >>> 13;
+ h1 *= 0xc2b2ae35;
+ h1 ^= h1 >>> 16;
+
+ return h1;
+ }
+
+
}