You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:53 UTC
(datasketches-java) 06/06: Test BloomFilterBuilder methods vs manually computed values from formulae
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit c55ea437f695426acd3b1acb78f4d7d9506ae8d7
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 22:52:49 2024 -0800
Test BloomFilterBuilder methods vs manually computed values from formulae
---
.../filters/bloomfilter/BloomFilterBuilder.java | 23 ++--
.../bloomfilter/BloomFilterBuilderTest.java | 118 +++++++++++++++++++++
2 files changed, 131 insertions(+), 10 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 938889d8..0934338d 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -19,6 +19,8 @@
package org.apache.datasketches.filters.bloomfilter;
+import java.util.concurrent.ThreadLocalRandom;
+
import org.apache.datasketches.common.SketchesArgumentException;
public final class BloomFilterBuilder {
@@ -27,10 +29,13 @@ public final class BloomFilterBuilder {
* Returns the optimal number of hash functions to given target numbers of distinct items
* and the BloomFliter size in bits.
* @param maxDistinctItems The maximum expected number of distinct items to add to the filter
- * @param numFilterBits The target size, in bits, of the Bloom Filter
+ * @param numFilterBits The intended size of the Bloom Filter in bits
* @return The suggested number of hash functions to use with the filter
*/
public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) {
+ if (maxDistinctItems < 1 || numFilterBits < 1) {
+ throw new SketchesArgumentException("maxDistinctItems and numFilterBits must be strictly positive");
+ }
// ceil to ensure we never average worse than the target
return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0)));
}
@@ -72,15 +77,7 @@ public final class BloomFilterBuilder {
* @return A new BloomFilter configured for the given input parameters
*/
public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) {
- if (maxDistinctItems <= 0) {
- throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
- }
- if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
- throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
- }
- final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
- final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
- return new BloomFilter(numBits, numHashes);
+ return newBloomFilter(maxDistinctItems, targetFalsePositiveProb, ThreadLocalRandom.current().nextLong());
}
/**
@@ -92,6 +89,12 @@ public final class BloomFilterBuilder {
* @return A new BloomFilter configured for the given input parameters
*/
public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) {
+ if (maxDistinctItems <= 0) {
+ throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+ }
+ if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+ throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+ }
final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
return new BloomFilter(numBits, numHashes, seed);
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java
new file mode 100644
index 00000000..d3a08f85
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.bloomfilter;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.fail;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.testng.annotations.Test;
+
+public class BloomFilterBuilderTest {
+
+ @Test
+ public void testSuggestHashesFromSizes() {
+ // invalid distinct items
+ try {
+ BloomFilterBuilder.suggestNumHashes(0, 32768);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // invalid filter size
+ try {
+ BloomFilterBuilder.suggestNumHashes(10_000, -1);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // manually computed values based on formula
+ assertEquals(BloomFilterBuilder.suggestNumHashes(100, 1L << 16), 455);
+ assertEquals(BloomFilterBuilder.suggestNumHashes(10_000, 1L << 12), 1);
+ assertEquals(BloomFilterBuilder.suggestNumHashes(1_000_000_000, Integer.MAX_VALUE * 4L), 6);
+ assertEquals(BloomFilterBuilder.suggestNumHashes(1_500_000, 1L << 24), 8);
+ }
+
+ @Test
+ public void testSuggestHashesFromProbability() {
+ // negative probability
+ try {
+ BloomFilterBuilder.suggestNumHashes(-0.5);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // probability > 1
+ try {
+ BloomFilterBuilder.suggestNumHashes(2.5);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // manually computed values based on formula
+ assertEquals(BloomFilterBuilder.suggestNumHashes(0.333), 2);
+ assertEquals(BloomFilterBuilder.suggestNumHashes(0.01), 7);
+ assertEquals(BloomFilterBuilder.suggestNumHashes(1e-12), 40);
+ }
+
+ @Test
+ public void testSuggestNumFilterBits() {
+ // non-positive number distincts
+ try {
+ BloomFilterBuilder.suggestNumFilterBits(0, 0.01);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // non-positive probability
+ try {
+ BloomFilterBuilder.suggestNumFilterBits(2500, 0.0);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // probability > 1
+ try {
+ BloomFilterBuilder.suggestNumFilterBits(1000, 2.5);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // manually computed values based on formula
+ assertEquals(BloomFilterBuilder.suggestNumFilterBits(250_000, 0.01), 2396265);
+ BloomFilter bf = BloomFilterBuilder.newBloomFilter(250_000, 0.01);
+ assertEquals(bf.getCapacity(), 2396288); // next smallest multiple of 64
+ assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(250_000, 2396288));
+
+ assertEquals(BloomFilterBuilder.suggestNumFilterBits(5_000_000, 1e-4), 95850584);
+ final long seed = 19805243;
+ bf = BloomFilterBuilder.newBloomFilter(5_000_000, 1e-4, seed);
+ assertEquals(bf.getCapacity(), 95850624); // next smallest multiple of 64
+ assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(5_000_000, 95850624));
+ assertEquals(bf.getSeed(), seed);
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org