You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2010/02/16 04:36:46 UTC
svn commit: r910383 - in /incubator/cassandra/branches/cassandra-0.5: ./
src/java/org/apache/cassandra/db/ src/java/org/apache/cassandra/io/
src/java/org/apache/cassandra/tools/ src/java/org/apache/cassandra/utils/
test/unit/org/apache/cassandra/utils/
Author: jbellis
Date: Tue Feb 16 03:36:45 2010
New Revision: 910383
URL: http://svn.apache.org/viewvc?rev=910383&view=rev
Log:
allow relaxing desired bloom filter buckets/element when there is no other way to fit it in a BitSet. patch by Stu Hood; reviewed by jbellis for CASSANDRA-790
Removed:
incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/tools/ThreadListBuilder.java
Modified:
incubator/cassandra/branches/cassandra-0.5/CHANGES.txt
incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java
incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java
incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java
incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java
incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java
Modified: incubator/cassandra/branches/cassandra-0.5/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/CHANGES.txt?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/CHANGES.txt (original)
+++ incubator/cassandra/branches/cassandra-0.5/CHANGES.txt Tue Feb 16 03:36:45 2010
@@ -5,7 +5,10 @@
(CASSANDRA-703)
* more accurate load estimate for bootstrapping (CASSANDRA-762)
* tolerate dead or unavailable bootstrap target on write (CASSANDRA-731)
-
+ * allow larger numbers of keys (> 140M) in a sstable bloom filter
+ (CASSANDRA-790)
+ * include jvm argument improvements from CASSANDRA-504 in debian package
+
0.5.0 final
* avoid attempting to delete temporary bootstrap files twice (CASSANDRA-681)
Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/db/ColumnIndexer.java Tue Feb 16 03:36:45 2010
@@ -83,7 +83,7 @@
columnCount += column.getObjectCount();
}
- BloomFilter bf = new BloomFilter(columnCount, 4);
+ BloomFilter bf = BloomFilter.getFilter(columnCount, 4);
for (IColumn column : columns)
{
bf.add(column.name());
Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/io/SSTableWriter.java Tue Feb 16 03:36:45 2010
@@ -49,7 +49,7 @@
super(filename, partitioner);
dataFile = new BufferedRandomAccessFile(path, "rw", (int)(DatabaseDescriptor.getFlushDataBufferSizeInMB() * 1024 * 1024));
indexFile = new BufferedRandomAccessFile(indexFilename(), "rw", (int)(DatabaseDescriptor.getFlushIndexBufferSizeInMB() * 1024 * 1024));
- bf = new BloomFilter((int)keyCount, 15); // TODO fix long -> int cast
+ bf = BloomFilter.getFilter(keyCount, 15);
}
private long beforeAppend(DecoratedKey decoratedKey) throws IOException
Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomCalculations.java Tue Feb 16 03:36:45 2010
@@ -27,12 +27,10 @@
* Filter class by helping to choose correct values of 'bits per element' and
* 'number of hash functions, k'.
*/
-public class BloomCalculations {
+class BloomCalculations {
- private static final int maxBuckets = 15;
private static final int minBuckets = 2;
private static final int minK = 1;
- private static final int maxK = 8;
private static final int[] optKPerBuckets =
new int[]{1, // dummy K for 0 buckets per element
1, // dummy K for 1 buckets per element
@@ -65,17 +63,17 @@
}; // the first column is a dummy column representing K=0.
/**
- * Given the number of buckets that can be used per element, return the optimal
- * number of hash functions in order to minimize the false positive rate.
+ * Given the number of buckets that can be used per element, return a
+ * specification that minimizes the false positive rate.
*
- * @param bucketsPerElement
- * @return The number of hash functions that minimize the false positive rate.
+ * @param bucketsPerElement The number of buckets per element for the filter.
+ * @return A spec that minimizes the false positive rate.
*/
- public static int computeBestK(int bucketsPerElement){
- assert bucketsPerElement >= 0;
- if(bucketsPerElement >= optKPerBuckets.length)
- return optKPerBuckets[optKPerBuckets.length-1];
- return optKPerBuckets[bucketsPerElement];
+ public static BloomSpecification computeBloomSpec(int bucketsPerElement)
+ {
+ assert bucketsPerElement >= 1;
+ assert bucketsPerElement <= probs.length - 1;
+ return new BloomSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
}
/**
@@ -100,17 +98,25 @@
* is considered more expensive than computing power, preference is given
* to minimizing buckets per element rather than number of hash functions.
*
+ * @param maxBucketsPerElement The maximum number of buckets available for the filter.
* @param maxFalsePosProb The maximum tolerable false positive rate.
* @return A Bloom Specification which would result in a false positive rate
- * less than specified by the function call.
+ * less than specified by the function call
+ * @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met
*/
- public static BloomSpecification computeBucketsAndK(double maxFalsePosProb){
+ public static BloomSpecification computeBloomSpec(int maxBucketsPerElement, double maxFalsePosProb)
+ {
+ assert maxBucketsPerElement >= 1;
+ assert maxBucketsPerElement <= probs.length - 1;
+ int maxK = probs[maxBucketsPerElement].length - 1;
+
// Handle the trivial cases
if(maxFalsePosProb >= probs[minBuckets][minK]) {
return new BloomSpecification(2, optKPerBuckets[2]);
}
- if(maxFalsePosProb < probs[maxBuckets][maxK]) {
- return new BloomSpecification(maxK, maxBuckets);
+ if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
+ throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
+ maxFalsePosProb, maxBucketsPerElement));
}
// First find the minimal required number of buckets:
Modified: incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/src/java/org/apache/cassandra/utils/BloomFilter.java Tue Feb 16 03:36:45 2010
@@ -25,10 +25,15 @@
import org.apache.cassandra.io.ICompactSerializer;
+import org.apache.log4j.Logger;
+
public class BloomFilter extends Filter
{
+ private static final Logger logger = Logger.getLogger(BloomFilter.class);
static ICompactSerializer<BloomFilter> serializer_ = new BloomFilterSerializer();
+ private static final int EXCESS = 20;
+
public static ICompactSerializer<BloomFilter> serializer()
{
return serializer_;
@@ -36,26 +41,63 @@
private BitSet filter_;
- public BloomFilter(int numElements, int bucketsPerElement)
+ BloomFilter(int hashes, BitSet filter)
{
- this(BloomCalculations.computeBestK(bucketsPerElement), new BitSet(numElements * bucketsPerElement + 20));
+ hashCount = hashes;
+ filter_ = filter;
}
- public BloomFilter(int numElements, double maxFalsePosProbability)
+ private static BitSet bucketsFor(long numElements, int bucketsPer)
{
- BloomCalculations.BloomSpecification spec = BloomCalculations
- .computeBucketsAndK(maxFalsePosProbability);
- filter_ = new BitSet(numElements * spec.bucketsPerElement + 20);
- hashCount = spec.K;
+ long numBits = numElements * bucketsPer + EXCESS;
+ return new BitSet((int)Math.min(Integer.MAX_VALUE, numBits));
}
- /*
- * This version is only used by the deserializer.
+ /**
+ * Calculates the maximum number of buckets per element that this implementation
+ * can support. Crucially, it will lower the bucket count if necessary to meet
+ * BitSet's size restrictions.
*/
- BloomFilter(int hashes, BitSet filter)
+ private static int maxBucketsPerElement(long numElements)
{
- hashCount = hashes;
- filter_ = filter;
+ numElements = Math.max(1, numElements);
+ double v = (Integer.MAX_VALUE - EXCESS) / (double)numElements;
+ if (v < 1.0)
+ {
+ throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
+ }
+ return Math.min(BloomCalculations.probs.length - 1, (int)v);
+ }
+
+ /**
+ * @return A BloomFilter with the lowest practical false positive probability
+ * for the given number of elements.
+ */
+ public static BloomFilter getFilter(long numElements, int targetBucketsPerElem)
+ {
+ int maxBucketsPerElement = Math.max(1, maxBucketsPerElement(numElements));
+ int bucketsPerElement = Math.min(targetBucketsPerElem, maxBucketsPerElement);
+ if (bucketsPerElement < targetBucketsPerElem)
+ {
+ logger.warn(String.format("Cannot provide an optimal BloomFilter for %d elements (%d/%d buckets per element).",
+ numElements, bucketsPerElement, targetBucketsPerElem));
+ }
+ BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement);
+ return new BloomFilter(spec.K, bucketsFor(numElements, spec.bucketsPerElement));
+ }
+
+ /**
+ * @return The smallest BloomFilter that can provide the given false positive
+ * probability rate for the given number of elements.
+ *
+ * Asserts that the given probability can be satisfied using this filter.
+ */
+ public static BloomFilter getFilter(long numElements, double maxFalsePosProbability)
+ {
+ assert maxFalsePosProbability <= 1.0 : "Invalid probability";
+ int bucketsPerElement = maxBucketsPerElement(numElements);
+ BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability);
+ return new BloomFilter(spec.K, bucketsFor(numElements, spec.bucketsPerElement));
}
public void clear()
Modified: incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/BloomFilterTest.java Tue Feb 16 03:36:45 2010
@@ -26,13 +26,10 @@
public class BloomFilterTest
{
public BloomFilter bf;
- public BloomCalculations.BloomSpecification spec = BloomCalculations.computeBucketsAndK(0.0001);
- static final int ELEMENTS = 10000;
public BloomFilterTest()
{
- bf = new BloomFilter(ELEMENTS, spec.bucketsPerElement);
- assert bf != null;
+ bf = BloomFilter.getFilter(FilterTest.ELEMENTS, FilterTest.MAX_FAILURE_RATE);
}
@Before
@@ -41,6 +38,19 @@
bf.clear();
}
+ @Test(expected=UnsupportedOperationException.class)
+ public void testBloomLimits1()
+ {
+ int maxBuckets = BloomCalculations.probs.length - 1;
+ int maxK = BloomCalculations.probs[maxBuckets].length - 1;
+
+ // possible
+ BloomCalculations.computeBloomSpec(maxBuckets, BloomCalculations.probs[maxBuckets][maxK]);
+
+ // impossible, throws
+ BloomCalculations.computeBloomSpec(maxBuckets, BloomCalculations.probs[maxBuckets][maxK] / 2);
+ }
+
@Test
public void testOne()
{
@@ -68,7 +78,7 @@
{
return;
}
- BloomFilter bf2 = new BloomFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTest.spec.bucketsPerElement);
+ BloomFilter bf2 = BloomFilter.getFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTest.MAX_FAILURE_RATE);
int skipEven = KeyGenerator.WordGenerator.WORDS % 2 == 0 ? 0 : 2;
FilterTest.testFalsePositives(bf2,
new KeyGenerator.WordGenerator(skipEven, 2),
Modified: incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java?rev=910383&r1=910382&r2=910383&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java (original)
+++ incubator/cassandra/branches/cassandra-0.5/test/unit/org/apache/cassandra/utils/FilterTest.java Tue Feb 16 03:36:45 2010
@@ -56,7 +56,7 @@
// used by filter subclass tests
static final double MAX_FAILURE_RATE = 0.1;
- public static final BloomCalculations.BloomSpecification spec = BloomCalculations.computeBucketsAndK(MAX_FAILURE_RATE);
+ public static final BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(15, MAX_FAILURE_RATE);
static final int ELEMENTS = 10000;
static final ResetableIterator<String> intKeys()