You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by th...@apache.org on 2011/12/22 11:02:13 UTC
svn commit: r1222141 - in /jackrabbit/sandbox/microkernel/src:
main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java
test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
Author: thomasm
Date: Thu Dec 22 10:02:12 2011
New Revision: 1222141
URL: http://svn.apache.org/viewvc?rev=1222141&view=rev
Log:
Improved bloom filter hash formula (short names are more common than long ones).
Modified:
jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java
jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
Modified: jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java?rev=1222141&r1=1222140&r2=1222141&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java (original)
+++ jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java Thu Dec 22 10:02:12 2011
@@ -24,7 +24,7 @@ public class BloomFilterUtils {
/**
* The multiply and shift constants for the supplemental hash function.
*/
- private static final int MUL = 257, SHIFT = 8;
+ private static final int MUL = 2153, SHIFT = 19;
/**
* The number of bits needed per stored element.
Modified: jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java?rev=1222141&r1=1222140&r2=1222141&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java (original)
+++ jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java Thu Dec 22 10:02:12 2011
@@ -62,10 +62,10 @@ public class MemoryNodeTest {
n = n.cloneAndAddChildNode("c", false, null, c, 3);
Assert.assertEquals("{\"a\":n1,\"b\":n2,\"c\":n3}", n.asString());
n = n.cloneAndAddChildNode("d", false, null, d, 4);
- Assert.assertEquals("{\":children\":n5,\":names\":\"0e\",\":children\":n6,\":names\":\"10\",\":childCount\":4}",
+ Assert.assertEquals("{\":children\":n5,\":names\":\"0e00\",\":children\":n6,\":names\":\"1000\",\":childCount\":4}",
n.asString());
n2 = NodeImpl.fromString(map, n.asString());
- Assert.assertEquals("{\":children\":n5,\":names\":\"0e\",\":children\":n6,\":names\":\"10\",\":childCount\":4}",
+ Assert.assertEquals("{\":children\":n5,\":names\":\"0e00\",\":children\":n6,\":names\":\"1000\",\":childCount\":4}",
n2.asString());
Assert.assertTrue(n2.exists("a"));
Assert.assertTrue(n2.exists("b"));
Modified: jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java?rev=1222141&r1=1222140&r2=1222141&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java (original)
+++ jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java Thu Dec 22 10:02:12 2011
@@ -19,6 +19,8 @@ package org.apache.jackrabbit.mk.util;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.math.BigInteger;
+import java.util.HashSet;
+import java.util.Random;
import org.junit.Test;
/**
@@ -30,29 +32,43 @@ public class BloomFilterUtilsTest {
* Program to calculate the best shift and multiply constants.
*/
public static void main(String... args) {
+ Random random = new Random(1);
+ HashSet<String> inSet = new HashSet<String>();
+ while (inSet.size() < 100) {
+ inSet.add(randomString(random));
+ }
+ Object[] in = inSet.toArray();
+ HashSet<String> notSet = new HashSet<String>();
+ while (notSet.size() < 10000) {
+ String k = randomString(random);
+ if (!inSet.contains(k)) {
+ notSet.add(k);
+ }
+ }
+ Object[] not = notSet.toArray();
int best = Integer.MAX_VALUE;
- for (int mul = 1; mul < 100000; mul++) {
+ for (int mul = 1; mul < 100000; mul += 2) {
if (!BigInteger.valueOf(mul).isProbablePrime(10)) {
continue;
}
for (int shift = 0; shift < 32; shift++) {
- byte[] bloom = BloomFilterUtils.createFilter(20, 64);
- for (int i = 0; i < 20; i++) {
- String k = String.valueOf(i);
+ byte[] bloom = BloomFilterUtils.createFilter(100, 64);
+ for (Object k : in) {
int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
add(bloom, h1, h2);
}
- for (int i = 0; i < 20; i++) {
- String k = String.valueOf(i);
- int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
- assertTrue(probablyContains(bloom, h1, h2));
- }
int falsePositives = 0;
- for (int i = 20; i < 100000; i++) {
- String k = String.valueOf(i);
+ for (Object k : not) {
int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
if (probablyContains(bloom, h1, h2)) {
falsePositives++;
+ // short false positives are bad
+ if (k.toString().length() < 4) {
+ falsePositives += 5;
+ }
+ if (falsePositives > best) {
+ break;
+ }
}
}
if (falsePositives < best) {
@@ -64,7 +80,64 @@ public class BloomFilterUtilsTest {
}
}
+ private static String randomString(Random r) {
+ if (r.nextInt(5) == 0) {
+ return randomName(r);
+ }
+ int length = 1 + Math.abs((int) r.nextGaussian() * 5);
+ if (r.nextBoolean()) {
+ length += r.nextInt(10);
+ }
+ char[] chars = new char[length];
+ for (int i = 0; i < length; i++) {
+ chars[i] = randomChar(r);
+ }
+ return new String(chars);
+ }
+
+ private static char randomChar(Random r) {
+ switch (r.nextInt(101) / 100) {
+ case 0:
+ case 1:
+ // 20% ascii
+ return (char) (32 + r.nextInt(127 - 32));
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ // 40% a-z
+ return (char) ('a' + r.nextInt('z' - 'a'));
+ case 6:
+ // 10% A-Z
+ return (char) ('A' + r.nextInt('Z' - 'A'));
+ case 7:
+ case 8:
+ // 20% 0-9
+ return (char) ('0' + r.nextInt('9' - '0'));
+ case 9:
+ // 10% aeiou
+ return "aeiou".charAt(r.nextInt("aeiou".length()));
+ }
+ // 1% unicode
+ return (char) r.nextInt(65535);
+ }
+
+ private static String randomName(Random r) {
+ int i = r.nextInt(1000);
+ // like TPC-C lastName, but lowercase
+ String[] n = {
+ "bar", "ought", "able", "pri", "pres", "ese", "anti",
+ "cally", "ation", "eing" };
+ StringBuilder buff = new StringBuilder();
+ buff.append(n[i / 100]);
+ buff.append(n[(i / 10) % 10]);
+ buff.append(n[i % 10]);
+ return buff.toString();
+ }
+
+
private static int hash(int oldHash, int mul, int shift) {
+ oldHash = oldHash ^ (int) (oldHash * 0x5DEECE66DL + 0xBL);
return oldHash ^ ((oldHash * mul) >> shift);
}
@@ -115,7 +188,7 @@ public class BloomFilterUtilsTest {
falsePositives++;
}
}
- assertEquals(113, falsePositives);
+ assertEquals(1101, falsePositives);
}
@Test
public void negativeHashCode() {