You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:47 UTC

(datasketches-java) branch bloom updated (43c21376 -> c55ea437)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git


    from 43c21376 Merge pull request #512 from jmalkin/bloom
     new c4315274 Merge branch 'bloom' of github.com:apache/datasketches-java into bloom
     new 257f2430 Merge branch 'bloom' of github.com:apache/datasketches-java into bloom
     new 933fce34 finish javadocs and fix missiing return types on some methods
     new 533cb81c finish javadocs for builder class
     new f1aadaa8 Finish main filter tests, add preamble format
     new c55ea437 Test BloomFilterBuilder methods vs manually computed values from formulae

The 6 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../datasketches/filters/bloomfilter/BitArray.java |  18 +-
 .../filters/bloomfilter/BloomFilter.java           | 289 ++++++++++++++++++---
 .../filters/bloomfilter/BloomFilterBuilder.java    |  61 ++++-
 .../filters/bloomfilter/BitArrayTest.java          |   2 +-
 .../bloomfilter/BloomFilterBuilderTest.java        | 118 +++++++++
 .../filters/bloomfilter/BloomFilterTest.java       | 131 +++++++++-
 6 files changed, 575 insertions(+), 44 deletions(-)
 create mode 100644 src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 04/06: finish javadocs for builder class

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 533cb81c67dc88644917a18411e50c31c926002c
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 18:36:52 2024 -0800

    finish javadocs for builder class
---
 .../filters/bloomfilter/BloomFilterBuilder.java    | 52 +++++++++++++++++++++-
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 46d48a26..938889d8 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -19,31 +19,79 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import org.apache.datasketches.common.SketchesArgumentException;
+
 public final class BloomFilterBuilder {
 
+  /**
+   * Returns the optimal number of hash functions to given target numbers of distinct items
+   * and the BloomFliter size in bits.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param numFilterBits The target size, in bits, of the Bloom Filter
+   * @return The suggested number of hash functions to use with the filter
+   */
   public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) {
     // ceil to ensure we never average worse than the target
     return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0)));
   }
 
+  /**
+   * Returns the optimal number of hash functions to achieve a target false positive probability.
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @return The suggested number of hash functions to use with the filter.
+   */
   public static short suggestNumHashes(final double targetFalsePositiveProb) {
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     // ceil to ensure we never average worse than the target
     return (short) Math.ceil((- Math.log(targetFalsePositiveProb) / Math.log(2)));
   }
 
+  /**
+   * Returns the optimal number of bits to use in a Bloom Filter given a target number of distinct
+   * items and a target false positive probability.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @return The suggested number of bits to use with the filter
+   */
   public static long suggestNumFilterBits(final long maxDistinctItems, final double targetFalsePositiveProb) {
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     return (long) Math.round(-maxDistinctItems * Math.log(targetFalsePositiveProb) / (Math.log(2) * Math.log(2)));
   }
 
+  /**
+   * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @return A new BloomFilter configured for the given input parameters
+   */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) {
-    // TODO validate inputs
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
     final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
     return new BloomFilter(numBits, numHashes);
   }
 
+  /**
+   * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs,
+   * using the provided base seed for the hash function.
+   * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
+   * @param targetFalsePositiveProb A desired false positive probability per item
+   * @param seed A base hash seed 
+   * @return A new BloomFilter configured for the given input parameters
+   */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) {
-    // TODO validate inputs
     final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
     final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
     return new BloomFilter(numBits, numHashes, seed);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 02/06: Merge branch 'bloom' of github.com:apache/datasketches-java into bloom

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 257f24307e19939c2e7e946785a33412eeeadaae
Merge: c4315274 43c21376
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 00:09:06 2024 -0800

    Merge branch 'bloom' of github.com:apache/datasketches-java into bloom



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 01/06: Merge branch 'bloom' of github.com:apache/datasketches-java into bloom

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit c4315274bb5c545166d0951571805573ed68af67
Merge: 61fdab97 ed1de413
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 00:07:38 2024 -0800

    Merge branch 'bloom' of github.com:apache/datasketches-java into bloom

 .../apache/datasketches/partitions/BoundsRule.java |   7 +-
 .../datasketches/partitions/Partitioner.java       |  22 ++--
 .../quantiles/ItemsSketchSortedView.java           | 112 ++++++++++++++++-----
 .../apache/datasketches/quantiles/ItemsUtil.java   |  54 +++++-----
 .../quantilescommon/GenericSortedViewIterator.java |  25 ++++-
 .../quantilescommon/SortedViewIterator.java        |  35 +++++--
 .../datasketches/sampling/EbppsItemsSample.java    |  82 ++++++++-------
 .../datasketches/sampling/EbppsItemsSketch.java    | 105 +++++++++++--------
 .../apache/datasketches/sampling/PreambleUtil.java |  12 +--
 .../ItemsSketchPartitionBoundariesTest.java        |  88 ++++++++++++++++
 .../quantiles/ItemsSketchSortedViewString.java     |   5 +-
 .../datasketches/quantiles/ItemsSketchTest.java    |   3 +-
 .../quantilescommon/CrossCheckQuantilesTest.java   |   2 +-
 ...psSampleTest.java => EbppsItemsSampleTest.java} |   4 +-
 ...psSketchTest.java => EbppsItemsSketchTest.java} |  25 +++--
 15 files changed, 413 insertions(+), 168 deletions(-)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 03/06: finish javadocs and fix missiing return types on some methods

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 933fce340e1c4126972a7afc84e1c0e12da8ffcc
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 12:43:46 2024 -0800

    finish javadocs and fix missiing return types on some methods
---
 .../datasketches/filters/bloomfilter/BitArray.java |  18 +-
 .../filters/bloomfilter/BloomFilter.java           | 270 ++++++++++++++++++---
 .../filters/bloomfilter/BitArrayTest.java          |   2 +-
 3 files changed, 252 insertions(+), 38 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
index c6dfae16..4f428e5f 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
@@ -38,6 +38,7 @@ final class BitArray {
   private long numBitsSet_;  // if -1, need to recompute value
   private long[] data_;
 
+  // creates an array of a given size
   BitArray(final long numBits) {
     if (numBits <= 0) {
       throw new SketchesArgumentException("Number of bits must be strictly positive. Found: " + numBits);
@@ -51,6 +52,7 @@ final class BitArray {
     data_ = new long[numLongs];
   }
 
+  // uses the provided array
   BitArray(final long[] data) {
     data_ = data;
     numBitsSet_ = 0;
@@ -59,6 +61,8 @@ final class BitArray {
     }
   }
 
+  // reads a serialized image, but the BitArray is not fully self-describing so requires
+  // a flag to indicate whether the array is empty
   static BitArray heapify(final Memory mem, final boolean isEmpty) {
     final int numLongs = mem.getInt(0);
     if (numLongs < 0) {
@@ -78,10 +82,14 @@ final class BitArray {
     return numBitsSet_ == 0;
   }
 
+  // queries a single bit in the array
   boolean getBit(final long index) {
     return (data_[(int) index >>> 6] & (1L << index)) != 0 ? true : false;
   }
 
+  // sets a single bit in the array without querying, meaning the method
+  // cannot properly track the number of bits set. 
+  // instead changes numBitsSet_ to indicate that the value is unreliable
   void setBit(final long index) {
     data_[(int) index >>> 6] |= 1L << index;
     numBitsSet_ = -1; // use as a dirty flag
@@ -95,11 +103,14 @@ final class BitArray {
       return true; // already seen
     } else {
       data_[offset] |= mask;
-      ++numBitsSet_;
+      if (numBitsSet_ != -1) { ++numBitsSet_; }
       return false; // new set
     }
   }
 
+  // may need to recompute value:
+  // O(1) if only getAndSetBit() has been used
+  // O(data_.length) if setBit() has ever been used
   long getNumBitsSet() {
     if (numBitsSet_ == -1) {
       numBitsSet_ = 0;
@@ -177,6 +188,7 @@ final class BitArray {
     }
   }
 
+  // prints the raw BitArray as 0s and 1s, one long per row
   @Override
   public String toString() {
     final StringBuilder sb = new StringBuilder();
@@ -188,7 +200,8 @@ final class BitArray {
     return sb.toString();
   }
 
-  String printLong(final long val) {
+  // prints a long as a series of 0s and 1s as little endian
+  private String printLong(final long val) {
     final StringBuilder sb = new StringBuilder();
     for (int j = 0; j < Long.SIZE; ++j) {
       sb.append((val & (1L << j)) != 0 ? "1" : "0");
@@ -197,6 +210,7 @@ final class BitArray {
     return sb.toString();
   }
 
+  // clears the array
   void reset() {
     final int numLongs = data_.length;
     data_ = new long[numLongs];
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index b576ad9f..d7919de5 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import static org.apache.datasketches.common.Util.LS;
+
 import java.nio.charset.StandardCharsets;
 import java.util.concurrent.ThreadLocalRandom;
 
@@ -197,6 +199,12 @@ public final class BloomFilter {
 
   /**
    * Updates the filter with the provided String.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
+   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
+   * </p>
+   *
    * @param item an item with which to update the filter
    */
   public void update(final String item) {
@@ -284,12 +292,26 @@ public final class BloomFilter {
   }
 
   // QUERY-AND-UPDATE METHODS 
+
+  /**
+   * Updates the filter with the provided long and 
+   * returns the result from quering that value prior to the update.
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update
+   */
   public boolean queryAndUpdate(final long item) {
     final long h0 = XxHash.hashLong(item, seed_);
     final long h1 = XxHash.hashLong(item, h0);
     return queryAndUpdateInternal(h0, h1);
   }
   
+  /**
+   * Updates the filter with the provided double and 
+   * returns the result from quering that value prior to the update.
+   * The double is canonicalized (NaN and +/- infinity) in the call.
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update
+   */
   public boolean queryAndUpdate(final double item) {
     // canonicalize all NaN & +/- infinity forms    
     final long[] data = { Double.doubleToLongBits(item) };
@@ -298,6 +320,18 @@ public final class BloomFilter {
     return queryAndUpdateInternal(h0, h1);
   }
 
+  /**
+   * Updates the filter with the provided String and 
+   * returns the result from quering that value prior to the update.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link #queryAndUpdate(char[])}
+   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update, or false if item is null
+   */
   public boolean queryAndUpdate(final String item) {
     if (item == null || item.isEmpty()) { return false; }
     final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
@@ -306,47 +340,84 @@ public final class BloomFilter {
     return queryAndUpdateInternal(h0, h1);
   }
 
+  /**
+   * Updates the filter with the provided byte[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
   public boolean queryAndUpdate(final byte[] data) {
     final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
     return queryAndUpdateInternal(h0, h1);
   }
   
-  public void queryAndUpdate(final char[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided char[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final char[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final short[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided short[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final short[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final int[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided int[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final int[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final long[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided long[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final long[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final Memory mem) {
-    if (mem == null) { return; }
+  /**
+   * Updates the filter with the provided Memory and 
+   * returns the result from quering that Memory prior to the update.
+   * @param mem an array with which to update the filter
+   * @return The query result prior to applying the update, or false if mem is null
+   */
+  public boolean queryAndUpdate(final Memory mem) {
+    if (mem == null) { return false; }
     final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
     final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
+  // Internal query-and-update method given pre-computed hashes
   private boolean queryAndUpdateInternal(final long h0, final long h1) {
     final long numBits = bitArray_.getCapacity();
     boolean valueAlreadyExists = true;
@@ -359,12 +430,29 @@ public final class BloomFilter {
   }
 
   // QUERY METHODS
+  /**
+   * Queries the filter with the provided long and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item
+   */
   public boolean query(final long item) {
     final long h0 = XxHash.hashLong(item, seed_);
     final long h1 = XxHash.hashLong(item, h0);
     return queryInternal(h0, h1);
   }
 
+  /**
+   * Queries the filter with the provided double and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible. Double values are
+   * canonicalized (NaN and +/- infinity) before querying.
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item
+   */
   public boolean query(final double item) {
     // canonicalize all NaN & +/- infinity forms    
     final long[] data = { Double.doubleToLongBits(item) };
@@ -373,6 +461,20 @@ public final class BloomFilter {
     return queryInternal(h0, h1);
   }
 
+  /**
+   * Queries the filter with the provided double and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
+   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item, or false if item is null
+   */
   public boolean query(final String item) {
     if (item == null || item.isEmpty()) { return false; }    
     final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
@@ -381,47 +483,97 @@ public final class BloomFilter {
     return queryInternal(h0, h1);
   }
 
+  /**
+   * Queries the filter with the provided byte[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
   public boolean query(final byte[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
     return queryInternal(h0, h1);
   }
 
-  public void query(final char[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided char[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final char[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final short[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided short[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final short[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final int[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided int[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final int[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final long[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided long[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final long[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final Memory mem) {
-    if (mem == null) { return; }
+  /**
+   * Queries the filter with the provided Memory and returns whether the
+   * data <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param mem a Memory array with which to query the filter
+   * @return The result of querying the filter with the given Memory, or false if data is null
+   */
+  public boolean query(final Memory mem) {
+    if (mem == null) { return false; }
     final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
     final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
+  // Internal method to query the filter given pre-computed hashes
   private boolean queryInternal(final long h0, final long h1) {
     final long numBits = bitArray_.getCapacity();
     for (int i = 1; i <= numHashes_; ++i) {
@@ -435,7 +587,13 @@ public final class BloomFilter {
   }
 
   // OTHER OPERATIONS
+  /**
+   * Unions two BloomFilters by applying a logical OR. The result will recognized
+   * any values seen by either filter (as well as false positives).
+   * @param other A BloomFilter to union with this one
+   */
   public void union(final BloomFilter other) {
+    if (other == null) { return; }
     if (!isCompatible(other)) {
       throw new SketchesArgumentException("Cannot union sketches with different seeds, hash functions, or sizes");
     }
@@ -443,7 +601,13 @@ public final class BloomFilter {
     bitArray_.union(other.bitArray_);
   }
 
+  /**
+   * Intersects two BloomFilters by applying a logical AND. The result will recognize
+   * only values seen by both filters (as well as false positives).
+   * @param other A BloomFilter to union with this one
+   */
   public void intersect(final BloomFilter other) {
+    if (other == null) { return; }
     if (!isCompatible(other)) {
       throw new SketchesArgumentException("Cannot union sketches with different seeds, hash functions, or sizes");
     }
@@ -451,12 +615,21 @@ public final class BloomFilter {
     bitArray_.intersect(other.bitArray_);
   }
 
+  /**
+   * Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.
+   */
   public void invert() {
     bitArray_.invert();
   }
 
+  /**
+   * Helps identify if two BloomFilters may be unioned or intersected.
+   * @param other A BloomFilter to check for compatibility with this one
+   * @return True if the filters are compatible, otherwise false
+   */
   public boolean isCompatible(final BloomFilter other) {
-    if (seed_ != other.seed_
+    if (other == null
+        || seed_ != other.seed_
         || numHashes_ != other.numHashes_
         || bitArray_.getArrayLength() != other.bitArray_.getArrayLength()) {
           return false;
@@ -464,16 +637,26 @@ public final class BloomFilter {
     return true;
   }
 
+  /**
+   * Returns the length of this BloomFilter when serialized, in bytes
+   * @return The length of this BloomFilter when serialized, in bytes
+   */
   public long getSerializedSizeBytes() {
     long sizeBytes = 2 * Long.BYTES; // basic sketch info + baseSeed
     sizeBytes += bitArray_.getSerializedSizeBytes();
     return sizeBytes;
   }
 
+  /**
+   * Serializes the current BloomFilter to an array of bytes.
+   *
+   * <p>Note: Method throws if the serialized size exceeds <tt>Integer.MAX_VALUE</tt>.</p>
+   * @return A serialized image of the current BloomFilter as byte[]
+   */
   public byte[] toByteArray() {
     final long sizeBytes = getSerializedSizeBytes();
     if (sizeBytes > Integer.MAX_VALUE) {
-      throw new SketchesStateException("Cannot serialize a Bloom Filter of this size using toByteArray(); use toLongArray() instead.");
+      throw new SketchesStateException("Cannot serialize a BloomFilter of this size using toByteArray(); use toLongArray() instead.");
     }
 
     final byte[] bytes = new byte[(int) sizeBytes];
@@ -494,6 +677,12 @@ public final class BloomFilter {
     return bytes;
   }
 
+  /**
+   * Serializes the current BloomFilter to an array of longs. Unlike {@link #toByteArray()},
+   * this method can handle any size filter.
+   *
+   * @return A serialized image of the current BloomFilter as long[]
+   */
   public long[] toLongArray() {
     final long sizeBytes = getSerializedSizeBytes();
 
@@ -515,14 +704,25 @@ public final class BloomFilter {
     return longs;
   }
 
-  private static void checkArgument(final boolean val, final String message) {
-    if (val) { throw new SketchesArgumentException(message); }
+  // Throws an exception with the provided message if the given condition is false
+  private static void checkArgument(final boolean condition, final String message) {
+    if (condition) { throw new SketchesArgumentException(message); }
   }
 
   @Override
   public String toString() {
     final StringBuilder sb = new StringBuilder();
-    sb.append(bitArray_.toString());
+
+    sb.append(LS);
+    final String thisSimpleName = this.getClass().getSimpleName();
+    sb.append("### ").append(thisSimpleName).append(" SUMMARY: ").append(LS);
+    sb.append("   numBits      : ").append(bitArray_.getCapacity()).append(LS);
+    sb.append("   numHashes    : ").append(numHashes_).append(LS);
+    sb.append("   seed         : ").append(seed_).append(LS);
+    sb.append("   bitsUsed     : ").append(bitArray_.getNumBitsSet()).append(LS);
+    sb.append("   fill %       : ").append(getFillPercentage()).append(LS);
+    sb.append("### END SKETCH SUMMARY").append(LS);
+
     return sb.toString();
   }
 }
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java
index 319f7777..2fa146e2 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java
@@ -28,7 +28,7 @@ import org.apache.datasketches.memory.WritableMemory;
 import org.testng.annotations.Test;
 
 public class BitArrayTest {
-  
+
   @Test
   public void createBitArrayTest() {
     final BitArray ba = new BitArray(119);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 06/06: Test BloomFilterBuilder methods vs manually computed values from formulae

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit c55ea437f695426acd3b1acb78f4d7d9506ae8d7
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 22:52:49 2024 -0800

    Test BloomFilterBuilder methods vs manually computed values from formulae
---
 .../filters/bloomfilter/BloomFilterBuilder.java    |  23 ++--
 .../bloomfilter/BloomFilterBuilderTest.java        | 118 +++++++++++++++++++++
 2 files changed, 131 insertions(+), 10 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 938889d8..0934338d 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import java.util.concurrent.ThreadLocalRandom;
+
 import org.apache.datasketches.common.SketchesArgumentException;
 
 public final class BloomFilterBuilder {
@@ -27,10 +29,13 @@ public final class BloomFilterBuilder {
    * Returns the optimal number of hash functions to given target numbers of distinct items
    * and the BloomFliter size in bits.
    * @param maxDistinctItems The maximum expected number of distinct items to add to the filter
-   * @param numFilterBits The target size, in bits, of the Bloom Filter
+   * @param numFilterBits The intended size of the Bloom Filter in bits
    * @return The suggested number of hash functions to use with the filter
    */
   public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) {
+    if (maxDistinctItems < 1 || numFilterBits < 1) {
+      throw new SketchesArgumentException("maxDistinctItems and numFilterBits must be strictly positive");
+    }
     // ceil to ensure we never average worse than the target
     return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0)));
   }
@@ -72,15 +77,7 @@ public final class BloomFilterBuilder {
    * @return A new BloomFilter configured for the given input parameters
    */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) {
-    if (maxDistinctItems <= 0) {
-      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
-    }
-    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
-      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
-    }
-    final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
-    final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
-    return new BloomFilter(numBits, numHashes);
+    return newBloomFilter(maxDistinctItems, targetFalsePositiveProb, ThreadLocalRandom.current().nextLong());
   }
 
   /**
@@ -92,6 +89,12 @@ public final class BloomFilterBuilder {
    * @return A new BloomFilter configured for the given input parameters
    */
   public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) {
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
     final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb);
     final short numHashes = suggestNumHashes(maxDistinctItems, numBits);
     return new BloomFilter(numBits, numHashes, seed);
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java
new file mode 100644
index 00000000..d3a08f85
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.bloomfilter;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.fail;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.testng.annotations.Test;
+
+public class BloomFilterBuilderTest {
+
+  @Test
+  public void testSuggestHashesFromSizes() {
+    // invalid distinct items
+    try {
+      BloomFilterBuilder.suggestNumHashes(0, 32768);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // invalid filter size
+    try {
+      BloomFilterBuilder.suggestNumHashes(10_000, -1);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // manually computed values based on formula
+    assertEquals(BloomFilterBuilder.suggestNumHashes(100, 1L << 16), 455);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(10_000, 1L << 12), 1);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(1_000_000_000, Integer.MAX_VALUE * 4L), 6);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(1_500_000, 1L << 24), 8);
+  }
+
+  @Test
+  public void testSuggestHashesFromProbability() {
+    // negative probability
+    try {
+      BloomFilterBuilder.suggestNumHashes(-0.5);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // probability > 1
+    try {
+      BloomFilterBuilder.suggestNumHashes(2.5);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // manually computed values based on formula
+    assertEquals(BloomFilterBuilder.suggestNumHashes(0.333), 2);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(0.01), 7);
+    assertEquals(BloomFilterBuilder.suggestNumHashes(1e-12), 40);
+  }
+
+  @Test
+  public void testSuggestNumFilterBits() {
+    // non-positive number distincts
+    try {
+      BloomFilterBuilder.suggestNumFilterBits(0, 0.01);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // non-positive probability
+    try {
+      BloomFilterBuilder.suggestNumFilterBits(2500, 0.0);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // probability > 1
+    try {
+      BloomFilterBuilder.suggestNumFilterBits(1000, 2.5);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // manually computed values based on formula
+    assertEquals(BloomFilterBuilder.suggestNumFilterBits(250_000, 0.01), 2396265);
+    BloomFilter bf = BloomFilterBuilder.newBloomFilter(250_000, 0.01);
+    assertEquals(bf.getCapacity(), 2396288); // next smallest multiple of 64
+    assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(250_000, 2396288));
+
+    assertEquals(BloomFilterBuilder.suggestNumFilterBits(5_000_000, 1e-4), 95850584);
+    final long seed = 19805243;
+    bf = BloomFilterBuilder.newBloomFilter(5_000_000, 1e-4, seed);
+    assertEquals(bf.getCapacity(), 95850624); // next smallest multiple of 64
+    assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(5_000_000, 95850624));
+    assertEquals(bf.getSeed(), seed);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 05/06: Finish main filter tests, add preamble format

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit f1aadaa89ee0d98c7181074b4680dfc63cfc50ee
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 20:50:23 2024 -0800

    Finish main filter tests, add preamble format
---
 .../filters/bloomfilter/BloomFilter.java           |  21 +++-
 .../filters/bloomfilter/BloomFilterTest.java       | 131 ++++++++++++++++++++-
 2 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index d7919de5..5df2777c 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -647,10 +647,29 @@ public final class BloomFilter {
     return sizeBytes;
   }
 
+/*
+ * A Bloom Filter's serialized image always uses 3 longs of preamble, whether empty or not:
+ *
+ * <pre>
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
+ *  0   || Preamble_Longs | SerVer | FamID  |  Flags |----Num Hashes---|-----Unused------|
+ *
+ *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |
+ *  1   ||---------------------------------Hash Seed-------------------------------------|
+ *
+ *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
+ *  2   ||-------BitArray Length (in longs)----------|-----------Unused------------------|
+ *  </pre>
+ * 
+ * The raw BitArray bits, if non-empty start at byte 24.
+ */
+
   /**
    * Serializes the current BloomFilter to an array of bytes.
    *
-   * <p>Note: Method throws if the serialized size exceeds <tt>Integer.MAX_VALUE</tt>.</p>
+   * <p>Note: Method throws if the serialized size exceeds <code>Integer.MAX_VALUE</code>.</p>
    * @return A serialized image of the current BloomFilter as byte[]
    */
   public byte[] toByteArray() {
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
index 564fbdab..5c0df995 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
@@ -22,6 +22,7 @@ package org.apache.datasketches.filters.bloomfilter;
 import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.fail;
 
 import org.apache.datasketches.common.SketchesArgumentException;
 import org.apache.datasketches.memory.Memory;
@@ -101,6 +102,40 @@ public class BloomFilterTest {
 
   }
 
+  @Test
+  public void incompatibleSetOperationsTest() {
+    final int numBits = 128;
+    final int numHashes = 4;
+    final BloomFilter bf1 = new BloomFilter(numBits, numHashes);
+
+    // mismatched num bits
+    try {
+      final BloomFilter bf2 = new BloomFilter(numBits * 2, numHashes, bf1.getSeed());
+      bf1.union(bf2);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // mismatched num hashes
+    try {
+      final BloomFilter bf2 = new BloomFilter(numBits, numHashes * 2, bf1.getSeed());
+      bf1.intersect(bf2);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // mismatched seed
+    try {
+      final BloomFilter bf2 = new BloomFilter(numBits, numHashes, bf1.getSeed() - 1);
+      bf1.union(bf2);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+  }
+
   @Test
   public void basicUnionTest() {
     final long numBits = 12288;
@@ -116,6 +151,7 @@ public class BloomFilterTest {
       bf2.queryAndUpdate(n / 2 + i);
     }
     
+    bf1.union(null); // no-op
     bf1.union(bf2);
     for (int i = 0; i < maxItem; ++i) {
       assertTrue(bf1.query(i));
@@ -144,6 +180,7 @@ public class BloomFilterTest {
       bf2.queryAndUpdate(n / 2 + i);
     }
     
+    bf1.intersect(null); // no-op
     bf1.intersect(bf2);
     for (int i = n / 2; i < n; ++i) {
       assertTrue(bf1.query(i));
@@ -229,5 +266,97 @@ public class BloomFilterTest {
     }
     assertEquals(fromLongsCount, n + count); // same numbers of items should match
   }
-}
 
+  @Test
+  public void testBasicUpdateMethods() {
+    final int numDistinct = 100;
+    final double fpp = 1e-6;
+    final BloomFilter bf = BloomFilterBuilder.newBloomFilter(numDistinct, fpp);
+
+    // empty/null String should do nothing
+    bf.update("");
+    bf.update((String) null);
+    assertFalse(bf.queryAndUpdate(""));
+    assertFalse(bf.queryAndUpdate((String) null));
+    assertEquals(bf.getBitsUsed(), 0);
+
+    bf.update("abc");
+    assertFalse(bf.queryAndUpdate("def"));
+    bf.update(932);
+    assertFalse(bf.queryAndUpdate(543));
+    bf.update(Double.NaN);
+    assertFalse(bf.queryAndUpdate(Double.POSITIVE_INFINITY));
+    assertTrue(bf.getBitsUsed() <= bf.getNumHashes() * 6);
+    assertFalse(bf.isEmpty());
+  }
+
+  @Test
+  public void testArrayUpdateMethods() {
+    // 3 doubles = 24 bytes
+    final double rawData[] = { 1.414, 2.71, 3.1415926538 };
+    final Memory mem = Memory.wrap(rawData);
+
+    final int numDistinct = 100;
+    final double fpp = 1e-6;
+
+    // for each BloomFilter update type, call update() then queryAndUpdate(), where
+    // the latter should return true. query() should likewise return true.
+    // A final intersection should have the same number of bits set as the raw input.
+    final BloomFilter bfMem = BloomFilterBuilder.newBloomFilter(numDistinct, fpp);
+    bfMem.update(mem);
+    assertTrue(bfMem.queryAndUpdate(mem));
+    assertTrue(bfMem.query(mem));
+    final long numBitsSet = bfMem.getBitsUsed();
+    final long seed = bfMem.getSeed();
+
+    final BloomFilter bfBytes = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final byte[] bytes = new byte[24];
+    mem.getByteArray(0, bytes, 0, 24);
+    bfBytes.update(bytes);
+    assertTrue(bfBytes.queryAndUpdate(bytes));
+    assertTrue(bfBytes.query(bytes));
+    assertEquals(bfBytes.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfChars = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final char[] chars = new char[12];
+    mem.getCharArray(0, chars, 0, 12);
+    bfChars.update(chars);
+    assertTrue(bfChars.queryAndUpdate(chars));
+    assertTrue(bfChars.query(chars));
+    assertEquals(bfChars.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfShorts = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final short[] shorts = new short[12];
+    mem.getShortArray(0, shorts, 0, 12);
+    bfShorts.update(shorts);
+    assertTrue(bfShorts.queryAndUpdate(shorts));
+    assertTrue(bfShorts.query(shorts));
+    assertEquals(bfShorts.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfInts = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final int[] ints = new int[6];
+    mem.getIntArray(0, ints, 0, 6);
+    bfInts.update(ints);
+    assertTrue(bfInts.queryAndUpdate(ints));
+    assertTrue(bfInts.query(ints));
+    assertEquals(bfInts.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfLongs = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final long[] longs = new long[3];
+    mem.getLongArray(0, longs, 0, 3);
+    bfLongs.update(longs);
+    assertTrue(bfLongs.queryAndUpdate(longs));
+    assertTrue(bfLongs.query(longs));
+    assertEquals(bfLongs.getBitsUsed(), numBitsSet);
+
+    // intersect all the sketches into a new one
+    final BloomFilter bf = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    bf.intersect(bfMem);
+    bf.intersect(bfBytes);
+    bf.intersect(bfChars);
+    bf.intersect(bfShorts);
+    bf.intersect(bfInts);
+    bf.intersect(bfLongs);
+    assertEquals(bfLongs.getBitsUsed(), numBitsSet);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org