You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by th...@apache.org on 2011/12/22 11:02:13 UTC

svn commit: r1222141 - in /jackrabbit/sandbox/microkernel/src: main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java

Author: thomasm
Date: Thu Dec 22 10:02:12 2011
New Revision: 1222141

URL: http://svn.apache.org/viewvc?rev=1222141&view=rev
Log:
Improved bloom filter hash formula (short names are more common than long ones).

Modified:
    jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
    jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java
    jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java

Modified: jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java?rev=1222141&r1=1222140&r2=1222141&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java (original)
+++ jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java Thu Dec 22 10:02:12 2011
@@ -24,7 +24,7 @@ public class BloomFilterUtils {
     /**
      * The multiply and shift constants for the supplemental hash function.
      */
-    private static final int MUL = 257, SHIFT = 8;
+    private static final int MUL = 2153, SHIFT = 19;
 
     /**
      * The number of bits needed per stored element.

Modified: jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java?rev=1222141&r1=1222140&r2=1222141&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java (original)
+++ jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/mem/MemoryNodeTest.java Thu Dec 22 10:02:12 2011
@@ -62,10 +62,10 @@ public class MemoryNodeTest {
         n = n.cloneAndAddChildNode("c", false, null, c, 3);
         Assert.assertEquals("{\"a\":n1,\"b\":n2,\"c\":n3}", n.asString());
         n = n.cloneAndAddChildNode("d", false, null, d, 4);
-        Assert.assertEquals("{\":children\":n5,\":names\":\"0e\",\":children\":n6,\":names\":\"10\",\":childCount\":4}",
+        Assert.assertEquals("{\":children\":n5,\":names\":\"0e00\",\":children\":n6,\":names\":\"1000\",\":childCount\":4}",
                 n.asString());
         n2 = NodeImpl.fromString(map, n.asString());
-        Assert.assertEquals("{\":children\":n5,\":names\":\"0e\",\":children\":n6,\":names\":\"10\",\":childCount\":4}",
+        Assert.assertEquals("{\":children\":n5,\":names\":\"0e00\",\":children\":n6,\":names\":\"1000\",\":childCount\":4}",
                 n2.asString());
         Assert.assertTrue(n2.exists("a"));
         Assert.assertTrue(n2.exists("b"));

Modified: jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java?rev=1222141&r1=1222140&r2=1222141&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java (original)
+++ jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java Thu Dec 22 10:02:12 2011
@@ -19,6 +19,8 @@ package org.apache.jackrabbit.mk.util;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import java.math.BigInteger;
+import java.util.HashSet;
+import java.util.Random;
 import org.junit.Test;
 
 /**
@@ -30,29 +32,43 @@ public class BloomFilterUtilsTest {
      * Program to calculate the best shift and multiply constants.
      */
     public static void main(String... args) {
+        Random random = new Random(1);
+        HashSet<String> inSet = new HashSet<String>();
+        while (inSet.size() < 100) {
+            inSet.add(randomString(random));
+        }
+        Object[] in = inSet.toArray();
+        HashSet<String> notSet = new HashSet<String>();
+        while (notSet.size() < 10000) {
+            String k = randomString(random);
+            if (!inSet.contains(k)) {
+                notSet.add(k);
+            }
+        }
+        Object[] not = notSet.toArray();
         int best = Integer.MAX_VALUE;
-        for (int mul = 1; mul < 100000; mul++) {
+        for (int mul = 1; mul < 100000; mul += 2) {
             if (!BigInteger.valueOf(mul).isProbablePrime(10)) {
                 continue;
             }
             for (int shift = 0; shift < 32; shift++) {
-                byte[] bloom = BloomFilterUtils.createFilter(20, 64);
-                for (int i = 0; i < 20; i++) {
-                    String k = String.valueOf(i);
+                byte[] bloom = BloomFilterUtils.createFilter(100, 64);
+                for (Object k : in) {
                     int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
                     add(bloom, h1, h2);
                 }
-                for (int i = 0; i < 20; i++) {
-                    String k = String.valueOf(i);
-                    int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
-                    assertTrue(probablyContains(bloom, h1, h2));
-                }
                 int falsePositives = 0;
-                for (int i = 20; i < 100000; i++) {
-                    String k = String.valueOf(i);
+                for (Object k : not) {
                     int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
                     if (probablyContains(bloom, h1, h2)) {
                         falsePositives++;
+                        // short false positives are bad
+                        if (k.toString().length() < 4) {
+                            falsePositives += 5;
+                        }
+                        if (falsePositives > best) {
+                            break;
+                        }
                     }
                 }
                 if (falsePositives < best) {
@@ -64,7 +80,64 @@ public class BloomFilterUtilsTest {
         }
     }
 
+    private static String randomString(Random r) {
+        if (r.nextInt(5) == 0) {
+            return randomName(r);
+        }
+        int length = 1 + Math.abs((int) r.nextGaussian() * 5);
+        if (r.nextBoolean()) {
+            length += r.nextInt(10);
+        }
+        char[] chars = new char[length];
+        for (int i = 0; i < length; i++) {
+            chars[i] = randomChar(r);
+        }
+        return new String(chars);
+    }
+
+    private static char randomChar(Random r) {
+        switch (r.nextInt(101) / 100) {
+        case 0:
+        case 1:
+            // 20% ascii
+            return (char) (32 + r.nextInt(127 - 32));
+        case 2:
+        case 3:
+        case 4:
+        case 5:
+            // 40% a-z
+            return (char) ('a' + r.nextInt('z' - 'a'));
+        case 6:
+            // 10% A-Z
+            return (char) ('A' + r.nextInt('Z' - 'A'));
+        case 7:
+        case 8:
+            // 20% 0-9
+            return (char) ('0' + r.nextInt('9' - '0'));
+        case 9:
+            // 10% aeiou
+            return "aeiou".charAt(r.nextInt("aeiou".length()));
+        }
+        // 1% unicode
+        return (char) r.nextInt(65535);
+    }
+
+    private static String randomName(Random r) {
+        int i = r.nextInt(1000);
+        // like TPC-C lastName, but lowercase
+        String[] n = {
+                "bar", "ought", "able", "pri", "pres", "ese", "anti",
+                "cally", "ation", "eing" };
+        StringBuilder buff = new StringBuilder();
+        buff.append(n[i / 100]);
+        buff.append(n[(i / 10) % 10]);
+        buff.append(n[i % 10]);
+        return buff.toString();
+    }
+
+
     private static int hash(int oldHash, int mul, int shift) {
+        oldHash = oldHash ^ (int) (oldHash * 0x5DEECE66DL + 0xBL);
         return oldHash ^ ((oldHash * mul) >> shift);
     }
 
@@ -115,7 +188,7 @@ public class BloomFilterUtilsTest {
                 falsePositives++;
             }
         }
-        assertEquals(113, falsePositives);
+        assertEquals(1101, falsePositives);
     }
     @Test
     public void negativeHashCode() {