You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2008/02/07 21:10:43 UTC
svn commit: r619620 - in /hadoop/hbase/trunk: CHANGES.txt
src/java/org/onelab/filter/CountingBloomFilter.java
src/test/org/onelab/test/TestFilter.java
Author: stack
Date: Thu Feb 7 12:10:40 2008
New Revision: 619620
URL: http://svn.apache.org/viewvc?rev=619620&view=rev
Log:
HBASE-19 CountingBloomFilter can overflow its storage
Modified:
hadoop/hbase/trunk/CHANGES.txt
hadoop/hbase/trunk/src/java/org/onelab/filter/CountingBloomFilter.java
hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java
Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=619620&r1=619619&r2=619620&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Thu Feb 7 12:10:40 2008
@@ -9,6 +9,8 @@
OPTIMIZATIONS
BUG FIXES
+ HBASE-19 CountingBloomFilter can overflow its storage
+ (Stu Hood and Bryan Duxbury via Stack)
IMPROVEMENTS
HBASE-415 Rewrite leases to use DelayedBlockingQueue instead of polling
Modified: hadoop/hbase/trunk/src/java/org/onelab/filter/CountingBloomFilter.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/onelab/filter/CountingBloomFilter.java?rev=619620&r1=619619&r2=619620&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/onelab/filter/CountingBloomFilter.java (original)
+++ hadoop/hbase/trunk/src/java/org/onelab/filter/CountingBloomFilter.java Thu Feb 7 12:10:40 2008
@@ -50,6 +50,7 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.util.Arrays;
/**
* Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
@@ -61,15 +62,18 @@
*
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
- * @version 1.0 - 5 Feb. 07
+ * @version 1.1 - 19 Jan. 08
*
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
*/
public final class CountingBloomFilter extends Filter {
- /** Counter vector. */
- private byte[] vector;
+ /** Storage for the counting buckets */
+ private long[] buckets;
+
+ /** We are using 4bit buckets, so each bucket can count to 15 */
+ private final static long BUCKET_MAX_VALUE = 15;
/** Default constructor - use with readFields */
public CountingBloomFilter() {}
@@ -81,9 +85,15 @@
*/
public CountingBloomFilter(int vectorSize, int nbHash){
super(vectorSize, nbHash);
- vector = new byte[vectorSize];
+ buckets = new long[buckets2words(vectorSize)];
}//end constructor
+ /** returns the number of 64 bit words it would take to hold vectorSize buckets */
+ private static int buckets2words(int vectorSize) {
+ return ((vectorSize - 1) >>> 4) + 1;
+ }
+
+
/** {@inheritDoc} */
@Override
public void add(Key key) {
@@ -95,7 +105,18 @@
hash.clear();
for(int i = 0; i < nbHash; i++) {
- vector[h[i]]++;
+ // find the bucket
+ int wordNum = h[i] >> 4; // div 16
+ int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
+
+ long bucketMask = 15L << bucketShift;
+ long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
+
+ // only increment if the count in the bucket is less than BUCKET_MAX_VALUE
+ if(bucketValue < BUCKET_MAX_VALUE) {
+ // increment by 1
+ buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift);
+ }
}
}//end add()
@@ -117,8 +138,17 @@
hash.clear();
for(int i = 0; i < nbHash; i++) {
- if(vector[h[i]] >= 1) {
- vector[h[i]]--;
+ // find the bucket
+ int wordNum = h[i] >> 4; // div 16
+ int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
+
+ long bucketMask = 15L << bucketShift;
+ long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
+
+ // only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE
+ if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
+ // decrement by 1
+ buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift);
}
}
}//end delete
@@ -133,9 +163,10 @@
throw new IllegalArgumentException("filters cannot be and-ed");
}
CountingBloomFilter cbf = (CountingBloomFilter)filter;
-
- for(int i = 0; i < vectorSize; i++) {
- this.vector[i] &= cbf.vector[i];
+
+ int sizeInWords = buckets2words(vectorSize);
+ for(int i = 0; i < sizeInWords; i++) {
+ this.buckets[i] &= cbf.buckets[i];
}
}//end and()
@@ -150,7 +181,13 @@
hash.clear();
for(int i = 0; i < nbHash; i++) {
- if(vector[h[i]] == 0) {
+ // find the bucket
+ int wordNum = h[i] >> 4; // div 16
+ int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
+
+ long bucketMask = 15L << bucketShift;
+
+ if((buckets[wordNum] & bucketMask) == 0) {
return false;
}
}
@@ -177,8 +214,9 @@
CountingBloomFilter cbf = (CountingBloomFilter)filter;
- for(int i = 0; i < vectorSize; i++) {
- this.vector[i] |= cbf.vector[i];
+ int sizeInWords = buckets2words(vectorSize);
+ for(int i = 0; i < sizeInWords; i++) {
+ this.buckets[i] |= cbf.buckets[i];
}
}//end or()
@@ -199,7 +237,14 @@
if(i > 0) {
res.append(" ");
}
- res.append(vector[i]&0xff);
+
+ int wordNum = i >> 4; // div 16
+ int bucketShift = (i & 0x0f) << 2; // (mod 16) * 4
+
+ long bucketMask = 15L << bucketShift;
+ long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
+
+ res.append(bucketValue);
}
return res.toString();
@@ -209,7 +254,7 @@
@Override
public Object clone(){
CountingBloomFilter cbf = new CountingBloomFilter(vectorSize, nbHash);
- cbf.or(this);
+ cbf.buckets = this.buckets.clone();
return cbf;
}//end clone()
@@ -219,8 +264,9 @@
@Override
public void write(DataOutput out) throws IOException {
super.write(out);
- for(int i = 0; i < vector.length; i++) {
- out.writeByte(vector[i]);
+ int sizeInWords = buckets2words(vectorSize);
+ for(int i = 0; i < sizeInWords; i++) {
+ out.writeLong(buckets[i]);
}
}
@@ -228,9 +274,10 @@
@Override
public void readFields(DataInput in) throws IOException {
super.readFields(in);
- vector = new byte[vectorSize];
- for(int i = 0; i < vector.length; i++) {
- vector[i] = in.readByte();
+ int sizeInWords = buckets2words(vectorSize);
+ buckets = new long[sizeInWords];
+ for(int i = 0; i < sizeInWords; i++) {
+ buckets[i] = in.readLong();
}
}
}//end class
Modified: hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java?rev=619620&r1=619619&r2=619620&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java (original)
+++ hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java Thu Feb 7 12:10:40 2008
@@ -94,6 +94,27 @@
assertTrue(bf.membershipTest(new StringKey("graknyl")));
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
assertFalse(bf.membershipTest(new StringKey("abcd")));
+
+ // delete 'key', and check that it is no longer a member
+ ((CountingBloomFilter)bf).delete(key);
+ assertFalse(bf.membershipTest(key));
+
+ // OR 'key' back into the filter
+ Filter bf2 = new CountingBloomFilter(8, 2);
+ bf2.add(key);
+ bf.or(bf2);
+ assertTrue(bf.membershipTest(key));
+ assertTrue(bf.membershipTest(new StringKey("graknyl")));
+ assertFalse(bf.membershipTest(new StringKey("xyzzy")));
+ assertFalse(bf.membershipTest(new StringKey("abcd")));
+
+ // to test for overflows, add 'key' enough times to overflow an 8bit bucket,
+ // while asserting that it stays a member
+ for(int i = 0; i < 257; i++){
+ bf.add(key);
+ assertTrue(bf.membershipTest(key));
+ }
+
}
/** Test a DynamicBloomFilter