You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2010/02/16 04:36:46 UTC

svn commit: r910383 - in /incubator/cassandra/branches/cassandra-0.5: ./ src/java/org/apache/cassandra/db/ src/java/org/apache/cassandra/io/ src/java/org/apache/cassandra/tools/ src/java/org/apache/cassandra/utils/ test/unit/org/apache/cassandra/utils/

Author: jbellis
Date: Tue Feb 16 03:36:45 2010
New Revision: 910383

URL: http://svn.apache.org/viewvc?rev=910383&view=rev
Log:
allow relaxing desired bloom filter buckets/element when there is no other way to fit it in a BitSet.  patch by Stu Hood; reviewed by jbellis for CASSANDRA-790

Removed:
    incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/tools/ThreadListBuilder.java
Modified:
    incubator/cassandra/branches/cassandra-0.5/CHANGES.txt
    incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java
    incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java
    incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java
    incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java
    incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
    incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java

Modified: incubator/cassandra/branches/cassandra-0.5/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/CHANGES.txt?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/CHANGES.txt (original)
+++ incubator/cassandra/branches/cassandra-0.5/CHANGES.txt Tue Feb 16 03:36:45 2010
@@ -5,7 +5,10 @@
    (CASSANDRA-703)
  * more accurate load estimate for bootstrapping (CASSANDRA-762)
  * tolerate dead or unavailable bootstrap target on write (CASSANDRA-731)
-
+ * allow larger numbers of keys (> 140M) in a sstable bloom filter
+   (CASSANDRA-790)
+ * include jvm argument improvements from CASSANDRA-504 in debian package
+ 
 
 0.5.0 final
  * avoid attempting to delete temporary bootstrap files twice (CASSANDRA-681)

Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java Tue Feb 16 03:36:45 2010
@@ -83,7 +83,7 @@
             columnCount += column.getObjectCount();
         }
 
-        BloomFilter bf = new BloomFilter(columnCount, 4);
+        BloomFilter bf = BloomFilter.getFilter(columnCount, 4);
         for (IColumn column : columns)
         {
             bf.add(column.name());

Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java Tue Feb 16 03:36:45 2010
@@ -49,7 +49,7 @@
         super(filename, partitioner);
         dataFile = new BufferedRandomAccessFile(path, "rw", (int)(DatabaseDescriptor.getFlushDataBufferSizeInMB() * 1024 * 1024));
         indexFile = new BufferedRandomAccessFile(indexFilename(), "rw", (int)(DatabaseDescriptor.getFlushIndexBufferSizeInMB() * 1024 * 1024));
-        bf = new BloomFilter((int)keyCount, 15); // TODO fix long -> int cast
+        bf = BloomFilter.getFilter(keyCount, 15);
     }
 
     private long beforeAppend(DecoratedKey decoratedKey) throws IOException

Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java Tue Feb 16 03:36:45 2010
@@ -27,12 +27,10 @@
  * Filter class by helping to choose correct values of 'bits per element' and
  * 'number of hash functions, k'.
  */
-public class BloomCalculations {
+class BloomCalculations {
 
-    private static final int maxBuckets = 15;
     private static final int minBuckets = 2;
     private static final int minK = 1;
-    private static final int maxK = 8;
     private static final int[] optKPerBuckets =
             new int[]{1, // dummy K for 0 buckets per element
                       1, // dummy K for 1 buckets per element
@@ -65,17 +63,17 @@
     };  // the first column is a dummy column representing K=0.
 
     /**
-     * Given the number of buckets that can be used per element, return the optimal
-     * number of hash functions in order to minimize the false positive rate.
+     * Given the number of buckets that can be used per element, return a
+     * specification that minimizes the false positive rate.
      *
-     * @param bucketsPerElement
-     * @return The number of hash functions that minimize the false positive rate.
+     * @param bucketsPerElement The number of buckets per element for the filter.
+     * @return A spec that minimizes the false positive rate.
      */
-    public static int computeBestK(int bucketsPerElement){
-        assert bucketsPerElement >= 0;
-        if(bucketsPerElement >= optKPerBuckets.length)
-            return optKPerBuckets[optKPerBuckets.length-1];
-        return optKPerBuckets[bucketsPerElement];
+    public static BloomSpecification computeBloomSpec(int bucketsPerElement)
+    {
+        assert bucketsPerElement >= 1;
+        assert bucketsPerElement <= probs.length - 1;
+        return new BloomSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
     }
 
     /**
@@ -100,17 +98,25 @@
      * is considered more expensive than computing power, preference is given
      * to minimizing buckets per element rather than number of hash functions.
      *
+     * @param maxBucketsPerElement The maximum number of buckets available for the filter.
      * @param maxFalsePosProb The maximum tolerable false positive rate.
      * @return A Bloom Specification which would result in a false positive rate
-     * less than specified by the function call.
+     * less than specified by the function call
+     * @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met
      */
-    public static BloomSpecification computeBucketsAndK(double maxFalsePosProb){
+    public static BloomSpecification computeBloomSpec(int maxBucketsPerElement, double maxFalsePosProb)
+    {
+        assert maxBucketsPerElement >= 1;
+        assert maxBucketsPerElement <= probs.length - 1;
+        int maxK = probs[maxBucketsPerElement].length - 1;
+
         // Handle the trivial cases
         if(maxFalsePosProb >= probs[minBuckets][minK]) {
             return new BloomSpecification(2, optKPerBuckets[2]);
         }
-        if(maxFalsePosProb < probs[maxBuckets][maxK]) {
-            return new BloomSpecification(maxK, maxBuckets);
+        if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
+            throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
+                                                                  maxFalsePosProb, maxBucketsPerElement));
         }
 
         // First find the minimal required number of buckets:

Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java Tue Feb 16 03:36:45 2010
@@ -25,10 +25,15 @@
 
 import org.apache.cassandra.io.ICompactSerializer;
 
+import org.apache.log4j.Logger;
+
 public class BloomFilter extends Filter
 {
+    private static final Logger logger = Logger.getLogger(BloomFilter.class);
     static ICompactSerializer<BloomFilter> serializer_ = new BloomFilterSerializer();
 
+    private static final int EXCESS = 20;
+
     public static ICompactSerializer<BloomFilter> serializer()
     {
         return serializer_;
@@ -36,26 +41,63 @@
 
     private BitSet filter_;
 
-    public BloomFilter(int numElements, int bucketsPerElement)
+    BloomFilter(int hashes, BitSet filter)
     {
-        this(BloomCalculations.computeBestK(bucketsPerElement), new BitSet(numElements * bucketsPerElement + 20));
+        hashCount = hashes;
+        filter_ = filter;
     }
 
-    public BloomFilter(int numElements, double maxFalsePosProbability)
+    private static BitSet bucketsFor(long numElements, int bucketsPer)
     {
-        BloomCalculations.BloomSpecification spec = BloomCalculations
-                .computeBucketsAndK(maxFalsePosProbability);
-        filter_ = new BitSet(numElements * spec.bucketsPerElement + 20);
-        hashCount = spec.K;
+        long numBits = numElements * bucketsPer + EXCESS;
+        return new BitSet((int)Math.min(Integer.MAX_VALUE, numBits));
     }
 
-    /*
-     * This version is only used by the deserializer.
+    /**
+     * Calculates the maximum number of buckets per element that this implementation
+     * can support.  Crucially, it will lower the bucket count if necessary to meet
+     * BitSet's size restrictions.
      */
-    BloomFilter(int hashes, BitSet filter)
+    private static int maxBucketsPerElement(long numElements)
     {
-        hashCount = hashes;
-        filter_ = filter;
+        numElements = Math.max(1, numElements);
+        double v = (Integer.MAX_VALUE - EXCESS) / (double)numElements;
+        if (v < 1.0)
+        {
+            throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
+        }
+        return Math.min(BloomCalculations.probs.length - 1, (int)v);
+    }
+
+    /**
+     * @return A BloomFilter with the lowest practical false positive probability
+     * for the given number of elements.
+     */
+    public static BloomFilter getFilter(long numElements, int targetBucketsPerElem)
+    {
+        int maxBucketsPerElement = Math.max(1, maxBucketsPerElement(numElements));
+        int bucketsPerElement = Math.min(targetBucketsPerElem, maxBucketsPerElement);
+        if (bucketsPerElement < targetBucketsPerElem)
+        {
+            logger.warn(String.format("Cannot provide an optimal BloomFilter for %d elements (%d/%d buckets per element).",
+                                      numElements, bucketsPerElement, targetBucketsPerElem));
+        }
+        BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement);
+        return new BloomFilter(spec.K, bucketsFor(numElements, spec.bucketsPerElement));
+    }
+
+    /**
+     * @return The smallest BloomFilter that can provide the given false positive
+     * probability rate for the given number of elements.
+     *
+     * Asserts that the given probability can be satisfied using this filter.
+     */
+    public static BloomFilter getFilter(long numElements, double maxFalsePosProbability)
+    {
+        assert maxFalsePosProbability <= 1.0 : "Invalid probability";
+        int bucketsPerElement = maxBucketsPerElement(numElements);
+        BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability);
+        return new BloomFilter(spec.K, bucketsFor(numElements, spec.bucketsPerElement));
     }
 
     public void clear()

Modified: incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java Tue Feb 16 03:36:45 2010
@@ -26,13 +26,10 @@
 public class BloomFilterTest
 {
     public BloomFilter bf;
-    public BloomCalculations.BloomSpecification spec = BloomCalculations.computeBucketsAndK(0.0001);
-    static final int ELEMENTS = 10000;
 
     public BloomFilterTest()
     {
-        bf = new BloomFilter(ELEMENTS, spec.bucketsPerElement);
-        assert bf != null;
+        bf = BloomFilter.getFilter(FilterTest.ELEMENTS, FilterTest.MAX_FAILURE_RATE);
     }
 
     @Before
@@ -41,6 +38,19 @@
         bf.clear();
     }
 
+    @Test(expected=UnsupportedOperationException.class)
+    public void testBloomLimits1()
+    {
+        int maxBuckets = BloomCalculations.probs.length - 1;
+        int maxK = BloomCalculations.probs[maxBuckets].length - 1;
+
+        // possible
+        BloomCalculations.computeBloomSpec(maxBuckets, BloomCalculations.probs[maxBuckets][maxK]);
+
+        // impossible, throws
+        BloomCalculations.computeBloomSpec(maxBuckets, BloomCalculations.probs[maxBuckets][maxK] / 2);
+    }
+
     @Test
     public void testOne()
     {
@@ -68,7 +78,7 @@
         {
             return;
         }
-        BloomFilter bf2 = new BloomFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTest.spec.bucketsPerElement);
+        BloomFilter bf2 = BloomFilter.getFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTest.MAX_FAILURE_RATE);
         int skipEven = KeyGenerator.WordGenerator.WORDS % 2 == 0 ? 0 : 2;
         FilterTest.testFalsePositives(bf2,
                                       new KeyGenerator.WordGenerator(skipEven, 2),

Modified: incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java Tue Feb 16 03:36:45 2010
@@ -56,7 +56,7 @@
     // used by filter subclass tests
 
     static final double MAX_FAILURE_RATE = 0.1;
-    public static final BloomCalculations.BloomSpecification spec = BloomCalculations.computeBucketsAndK(MAX_FAILURE_RATE);
+    public static final BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(15, MAX_FAILURE_RATE);
     static final int ELEMENTS = 10000;
 
     static final ResetableIterator<String> intKeys()