You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/07/17 02:45:40 UTC

svn commit: r964990 - in /hbase/trunk: CHANGES.txt src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java

Author: stack
Date: Sat Jul 17 00:45:40 2010
New Revision: 964990

URL: http://svn.apache.org/viewvc?rev=964990&view=rev
Log:
HBASE-2843 Re-add bloomfilter test over-zealously removed by HBASE-2625 

Added:
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java
Modified:
    hbase/trunk/CHANGES.txt

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=964990&r1=964989&r2=964990&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Sat Jul 17 00:45:40 2010
@@ -438,6 +438,7 @@ Release 0.21.0 - Unreleased
                in the right state (Karthik Ranganathan via JD)
    HBASE-2727  Splits writing one file only is untenable; need dir of recovered
                edits ordered by sequenceid
+   HBASE-2843  Readd bloomfilter test over zealously removed by HBASE-2625 
 
   IMPROVEMENTS
    HBASE-1760  Cleanup TODOs in HTable

Added: hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java?rev=964990&view=auto
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java (added)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestByteBloomFilter.java Sat Jul 17 00:45:40 2010
@@ -0,0 +1,184 @@
+/*
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.nio.ByteBuffer;
+import java.util.BitSet;
+
+import junit.framework.TestCase;
+
+public class TestByteBloomFilter extends TestCase {
+  
+  public void testBasicBloom() throws Exception {
+    ByteBloomFilter bf1 = new ByteBloomFilter(1000, (float)0.01, Hash.MURMUR_HASH, 0);
+    ByteBloomFilter bf2 = new ByteBloomFilter(1000, (float)0.01, Hash.MURMUR_HASH, 0);
+    bf1.allocBloom();
+    bf2.allocBloom();
+    
+    // test 1: verify no fundamental false negatives or positives
+    byte[] key1 = {1,2,3,4,5,6,7,8,9};
+    byte[] key2 = {1,2,3,4,5,6,7,8,7};
+    
+    bf1.add(key1);
+    bf2.add(key2);
+    
+    assertTrue(bf1.contains(key1));
+    assertFalse(bf1.contains(key2));
+    assertFalse(bf2.contains(key1));
+    assertTrue(bf2.contains(key2));
+    
+    byte [] bkey = {1,2,3,4};
+    byte [] bval = "this is a much larger byte array".getBytes();
+    
+    bf1.add(bkey);
+    bf1.add(bval, 1, bval.length-1);
+    
+    assertTrue( bf1.contains(bkey) );
+    assertTrue( bf1.contains(bval, 1, bval.length-1) );
+    assertFalse( bf1.contains(bval) );
+    assertFalse( bf1.contains(bval) );
+    
+    // test 2: serialization & deserialization.  
+    // (convert bloom to byte array & read byte array back in as input)
+    ByteArrayOutputStream bOut = new ByteArrayOutputStream();
+    bf1.writeBloom(new DataOutputStream(bOut));
+    ByteBuffer bb = ByteBuffer.wrap(bOut.toByteArray()); 
+    ByteBloomFilter newBf1 = new ByteBloomFilter(1000, (float)0.01,
+        Hash.MURMUR_HASH, 0);
+    assertTrue(newBf1.contains(key1, bb));
+    assertFalse(newBf1.contains(key2, bb));
+    assertTrue( newBf1.contains(bkey, bb) );
+    assertTrue( newBf1.contains(bval, 1, bval.length-1, bb) );
+    assertFalse( newBf1.contains(bval, bb) );
+    assertFalse( newBf1.contains(bval, bb) );
+    
+    System.out.println("Serialized as " + bOut.size() + " bytes");
+    assertTrue(bOut.size() - bf1.byteSize < 10); //... allow small padding
+  }
+  
+  public void testBloomFold() throws Exception {
+    // test: foldFactor < log(max/actual)
+    ByteBloomFilter b = new ByteBloomFilter(1003, (float)0.01, Hash.MURMUR_HASH, 2);
+    b.allocBloom();
+    int origSize = b.getByteSize();
+    assertEquals(1204, origSize);
+    for (int i = 0; i < 12; ++i) {
+      b.add(Bytes.toBytes(i));
+    }
+    b.compactBloom();
+    assertEquals(origSize>>2, b.getByteSize());
+    int falsePositives = 0;
+    for (int i = 0; i < 25; ++i) {
+      if (b.contains(Bytes.toBytes(i))) {
+        if(i >= 12) falsePositives++;
+      } else {
+        assertFalse(i < 12);
+      }
+    }
+    assertTrue(falsePositives <= 1);
+
+    // test: foldFactor > log(max/actual)
+  }
+
+  public void testBloomPerf() throws Exception {
+    // add
+    float err = (float)0.01;
+    ByteBloomFilter b = new ByteBloomFilter(10*1000*1000, (float)err, Hash.MURMUR_HASH, 3);
+    b.allocBloom();
+    long startTime =  System.currentTimeMillis();
+    int origSize = b.getByteSize();
+    for (int i = 0; i < 1*1000*1000; ++i) {
+      b.add(Bytes.toBytes(i));
+    }
+    long endTime = System.currentTimeMillis();
+    System.out.println("Total Add time = " + (endTime - startTime) + "ms");
+
+    // fold
+    startTime = System.currentTimeMillis();
+    b.compactBloom();
+    endTime = System.currentTimeMillis();
+    System.out.println("Total Fold time = " + (endTime - startTime) + "ms");
+    assertTrue(origSize >= b.getByteSize()<<3);
+    
+    // test
+    startTime = System.currentTimeMillis();
+    int falsePositives = 0;
+    for (int i = 0; i < 2*1000*1000; ++i) {
+      
+      if (b.contains(Bytes.toBytes(i))) {
+        if(i >= 1*1000*1000) falsePositives++;
+      } else {
+        assertFalse(i < 1*1000*1000);
+      }
+    }
+    endTime = System.currentTimeMillis();
+    System.out.println("Total Contains time = " + (endTime - startTime) + "ms");
+    System.out.println("False Positive = " + falsePositives);
+    assertTrue(falsePositives <= (1*1000*1000)*err);
+
+    // test: foldFactor > log(max/actual)
+  }
+
+  public void testDynamicBloom() throws Exception {
+    int keyInterval = 1000;
+    float err = (float)0.01;
+    BitSet valid = new BitSet(keyInterval*4);
+
+    DynamicByteBloomFilter bf1 = new DynamicByteBloomFilter(keyInterval, err,
+        Hash.MURMUR_HASH);
+    bf1.allocBloom();
+    
+    for (int i = 0; i < keyInterval*4; ++i) { // add
+      if (Math.random() > 0.5) {
+        bf1.add(Bytes.toBytes(i));
+        valid.set(i);
+      }
+    }
+    assertTrue(2 <= bf1.bloomCount() && bf1.bloomCount() <= 3);
+
+    // test serialization/deserialization
+    ByteArrayOutputStream metaOut = new ByteArrayOutputStream();
+    ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
+    bf1.getMetaWriter().write(new DataOutputStream(metaOut));
+    bf1.getDataWriter().write(new DataOutputStream(dataOut));
+    ByteBuffer bb = ByteBuffer.wrap(dataOut.toByteArray()); 
+    DynamicByteBloomFilter newBf1 = new DynamicByteBloomFilter(
+        ByteBuffer.wrap(metaOut.toByteArray()));
+
+    int falsePositives = 0;
+    for (int i = 0; i < keyInterval*4; ++i) { // check
+      if (newBf1.contains(Bytes.toBytes(i), bb)) {
+        if (!valid.get(i)) ++falsePositives;
+      } else {
+        if (valid.get(i)) {
+          assert false;
+        }
+      }
+    }
+    
+    // note that actualErr = err * bloomCount
+    // error rate should be roughly: (keyInterval*2)*(err*2), allow some tolerance
+    System.out.println("False positives: " + falsePositives);
+    assertTrue(falsePositives <= (keyInterval*5)*err); 
+  }
+}
\ No newline at end of file