You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:51 UTC
(datasketches-java) 04/06: finish javadocs for builder class
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit 533cb81c67dc88644917a18411e50c31c926002c
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 18:36:52 2024 -0800
finish javadocs for builder class
---
.../filters/bloomfilter/BloomFilterBuilder.java | 52 +++++++++++++++++++++-
1 file changed, 50 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 46d48a26..938889d8 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -19,31 +19,79 @@
package org.apache.datasketches.filters.bloomfilter;
+import org.apache.datasketches.common.SketchesArgumentException;
+
public final class BloomFilterBuilder {
+ /**
+ * Returns the optimal number of hash functions to use given a target number of distinct items
+ * and the Bloom Filter size in bits. The computed value is clamped to at least 1 before being
+ * narrowed to a short.
+ *
+ * <p>NOTE(review): the inputs are not validated here; a zero {@code maxDistinctItems} makes the
+ * floating-point division non-finite, so callers should pass strictly positive values.</p>
+ * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+ * @param numFilterBits The target size, in bits, of the Bloom Filter
+ * @return The suggested number of hash functions to use with the filter
+ */
public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) {
// ceil to ensure we never average worse than the target
return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0)));
}
+ /**
+ * Returns the optimal number of hash functions to achieve a target false positive probability.
+ * The value is the ceiling of {@code -ln(targetFalsePositiveProb) / ln(2)}.
+ * @param targetFalsePositiveProb A desired false positive probability per item, greater than 0
+ * and at most 1
+ * @return The suggested number of hash functions to use with the filter.
+ * @throws SketchesArgumentException if targetFalsePositiveProb is less than or equal to 0
+ * or greater than 1
+ */
public static short suggestNumHashes(final double targetFalsePositiveProb) {
+ if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+ throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+ }
// ceil to ensure we never average worse than the target
return (short) Math.ceil((- Math.log(targetFalsePositiveProb) / Math.log(2)));
}
+ /**
+ * Returns the optimal number of bits to use in a Bloom Filter given a target number of distinct
+ * items and a target false positive probability. The value is
+ * {@code -maxDistinctItems * ln(targetFalsePositiveProb) / (ln(2) * ln(2))}, rounded to the
+ * nearest long.
+ * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+ * @param targetFalsePositiveProb A desired false positive probability per item, greater than 0
+ * and at most 1
+ * @return The suggested number of bits to use with the filter
+ * @throws SketchesArgumentException if maxDistinctItems is not strictly positive, or if
+ * targetFalsePositiveProb is less than or equal to 0 or greater than 1
+ */
public static long suggestNumFilterBits(final long maxDistinctItems, final double targetFalsePositiveProb) {
+ if (maxDistinctItems <= 0) {
+ throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+ }
+ if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+ throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+ }
return (long) Math.round(-maxDistinctItems * Math.log(targetFalsePositiveProb) / (Math.log(2) * Math.log(2)));
}
+ /**
+ * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs.
+ * The filter size in bits is chosen first from the two inputs; the number of hash functions is
+ * then derived from maxDistinctItems and that size.
+ * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+ * @param targetFalsePositiveProb A desired false positive probability per item, greater than 0
+ * and at most 1
+ * @return A new BloomFilter configured for the given input parameters
+ * @throws SketchesArgumentException if maxDistinctItems is not strictly positive, or if
+ * targetFalsePositiveProb is less than or equal to 0 or greater than 1
+ */
public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) {
- // TODO validate inputs
+ if (maxDistinctItems <= 0) {
+ throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+ }
+ if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+ throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+ }
final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
return new BloomFilter(numBits, numHashes);
}
+ /**
+ * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs,
+ * using the provided base seed for the hash function.
+ * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+ * @param targetFalsePositiveProb A desired false positive probability per item
+ * @param seed A base hash seed
+ * @return A new BloomFilter configured for the given input parameters
+ */
public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) {
- // TODO validate inputs
final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
return new BloomFilter(numBits, numHashes, seed);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org