You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:51 UTC

(datasketches-java) 04/06: finish javadocs for builder class

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 533cb81c67dc88644917a18411e50c31c926002c
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 18:36:52 2024 -0800

    finish javadocs for builder class
---
 .../filters/bloomfilter/BloomFilterBuilder.java    | 52 +++++++++++++++++++++-
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 46d48a26..938889d8 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -19,31 +19,79 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import org.apache.datasketches.common.SketchesArgumentException;
+
 public final class BloomFilterBuilder {
 
+  /**
+   * Returns the optimal number of hash functions to given target numbers of distinct items
+   * and the BloomFliter size in bits.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param numFilterBits The target size, in bits, of the Bloom Filter
+   * @return The suggested number of hash functions to use with the filter
+   */
   public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) {
     // ceil to ensure we never average worse than the target
     return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0)));
   }
 
+  /**
+   * Returns the optimal number of hash functions to achieve a target false positive probability.
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @return The suggested number of hash functions to use with the filter.
+   */
   public static short suggestNumHashes(final double targetFalsePositiveProb) {
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     // ceil to ensure we never average worse than the target
     return (short) Math.ceil((- Math.log(targetFalsePositiveProb) / Math.log(2)));
   }
 
+  /**
+   * Returns the optimal number of bits to use in a Bloom Filter given a target number of distinct
+   * items and a target false positive probability.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @return The suggested number of bits to use with the filter
+   */
   public static long suggestNumFilterBits(final long maxDistinctItems, final double targetFalsePositiveProb) {
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     return (long) Math.round(-maxDistinctItems * Math.log(targetFalsePositiveProb) / (Math.log(2) * Math.log(2)));
   }
 
+  /**
+   * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @return A new BloomFilter configured for the given input parameters
+   */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) {
-    // TODO validate inputs
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
     final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
     return new BloomFilter(numBits, numHashes);
   }
 
+  /**
+   * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs,
+   * using the provided base seed for the hash function.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @param seed A base hash seed 
+   * @return A new BloomFilter configured for the given input parameters
+   */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) {
-    // TODO validate inputs
     final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
     final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
     return new BloomFilter(numBits, numHashes, seed);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org