You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:53 UTC

(datasketches-java) 06/06: Test BloomFilterBuilder methods vs manually computed values from formulae

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit c55ea437f695426acd3b1acb78f4d7d9506ae8d7
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 22:52:49 2024 -0800

    Test BloomFilterBuilder methods vs manually computed values from formulae
---
 .../filters/bloomfilter/BloomFilterBuilder.java    |  23 ++--
 .../bloomfilter/BloomFilterBuilderTest.java        | 118 +++++++++++++++++++++
 2 files changed, 131 insertions(+), 10 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 938889d8..0934338d 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import java.util.concurrent.ThreadLocalRandom;
+
 import org.apache.datasketches.common.SketchesArgumentException;
 
 public final class BloomFilterBuilder {
@@ -27,10 +29,13 @@ public final class BloomFilterBuilder {
    * Returns the optimal number of hash functions to given target numbers of distinct items
    * and the BloomFliter size in bits.
    * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
-   * @param numFilterBits The target size, in bits, of the Bloom Filter
+   * @param numFilterBits The intended size of the Bloom Filter in bits
    * @return The suggested number of hash functions to use with the filter
    */
   public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) {
+    if (maxDistinctItems < 1 || numFilterBits < 1) {
+      throw new SketchesArgumentException("maxDistinctItems and numFilterBits must be strictly positive");
+    }
     // ceil to ensure we never average worse than the target
     return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0)));
   }
@@ -72,15 +77,7 @@ public final class BloomFilterBuilder {
    * @return A new BloomFilter configured for the given input parameters
    */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) {
-    if (maxDistinctItems <= 0) {
-      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
-    }
-    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
-      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
-    }
-    final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
-    final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
-    return new BloomFilter(numBits, numHashes);
+    return newBloomFilter(maxDistinctItems, targetFalsePositiveProb, ThreadLocalRandom.current().nextLong());
   }
 
   /**
@@ -92,6 +89,12 @@ public final class BloomFilterBuilder {
    * @return A new BloomFilter configured for the given input parameters
    */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) {
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
     final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
     return new BloomFilter(numBits, numHashes, seed);
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java
new file mode 100644
index 00000000..d3a08f85
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.bloomfilter;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.fail;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.testng.annotations.Test;
+
+public class BloomFilterBuilderTest {
+
+  @Test
+  public void testSuggestHashesFromSizes() {
+    // invalid distinct items
+    try {
+      BloomFilterBuilder.suggestNumHashes(0, 32768);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // invalid filter size
+    try {
+      BloomFilterBuilder.suggestNumHashes(10_000, -1);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // manually computed values based on formula
+    assertEquals(BloomFilterBuilder.suggestNumHashes(100, 1L << 16), 455);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(10_000, 1L << 12), 1);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(1_000_000_000, Integer.MAX_VALUE * 4L), 6);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(1_500_000, 1L << 24), 8);
+  }
+
+  @Test
+  public void testSuggestHashesFromProbability() {
+    // negative probability
+    try {
+      BloomFilterBuilder.suggestNumHashes(-0.5);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // probability > 1
+    try {
+      BloomFilterBuilder.suggestNumHashes(2.5);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // manually computed values based on formula
+    assertEquals(BloomFilterBuilder.suggestNumHashes(0.333), 2);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(0.01), 7);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(1e-12), 40);
+  }
+
+  @Test
+  public void testSuggestNumFilterBits() {
+    // non-positive number distincts
+    try {
+      BloomFilterBuilder.suggestNumFilterBits(0, 0.01);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // non-positive probability
+    try {
+      BloomFilterBuilder.suggestNumFilterBits(2500, 0.0);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // probability > 1
+    try {
+      BloomFilterBuilder.suggestNumFilterBits(1000, 2.5);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // manually computed values based on formula
+    assertEquals(BloomFilterBuilder.suggestNumFilterBits(250_000, 0.01), 2396265);
+    BloomFilter bf = BloomFilterBuilder.newBloomFilter(250_000, 0.01);
+    assertEquals(bf.getCapacity(), 2396288); // next smallest multiple of 64
+    assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(250_000, 2396288));
+
+    assertEquals(BloomFilterBuilder.suggestNumFilterBits(5_000_000, 1e-4), 95850584);
+    final long seed = 19805243;
+    bf = BloomFilterBuilder.newBloomFilter(5_000_000, 1e-4, seed);
+    assertEquals(bf.getCapacity(), 95850624); // next smallest multiple of 64
+    assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(5_000_000, 95850624));
+    assertEquals(bf.getSeed(), seed);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org