You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:50 UTC

(datasketches-java) 03/06: finish javadocs and fix missiing return types on some methods

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 933fce340e1c4126972a7afc84e1c0e12da8ffcc
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 12:43:46 2024 -0800

    finish javadocs and fix missiing return types on some methods
---
 .../datasketches/filters/bloomfilter/BitArray.java |  18 +-
 .../filters/bloomfilter/BloomFilter.java           | 270 ++++++++++++++++++---
 .../filters/bloomfilter/BitArrayTest.java          |   2 +-
 3 files changed, 252 insertions(+), 38 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
index c6dfae16..4f428e5f 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
@@ -38,6 +38,7 @@ final class BitArray {
   private long numBitsSet_;  // if -1, need to recompute value
   private long[] data_;
 
+  // creates an array of a given size
   BitArray(final long numBits) {
     if (numBits <= 0) {
       throw new SketchesArgumentException("Number of bits must be strictly positive. Found: " + numBits);
@@ -51,6 +52,7 @@ final class BitArray {
     data_ = new long[numLongs];
   }
 
+  // uses the provided array
   BitArray(final long[] data) {
     data_ = data;
     numBitsSet_ = 0;
@@ -59,6 +61,8 @@ final class BitArray {
     }
   }
 
+  // reads a serialized image, but the BitArray is not fully self-describing so requires
+  // a flag to indicate whether the array is empty
   static BitArray heapify(final Memory mem, final boolean isEmpty) {
     final int numLongs = mem.getInt(0);
     if (numLongs < 0) {
@@ -78,10 +82,14 @@ final class BitArray {
     return numBitsSet_ == 0;
   }
 
+  // queries a single bit in the array
   boolean getBit(final long index) {
     return (data_[(int) index >>> 6] & (1L << index)) != 0 ? true : false;
   }
 
+  // sets a single bit in the array without querying, meaning the method
+  // cannot properly track the number of bits set. 
+  // instead changes numBitsSet_ to indicate that the value is unreliable
   void setBit(final long index) {
     data_[(int) index >>> 6] |= 1L << index;
     numBitsSet_ = -1; // use as a dirty flag
@@ -95,11 +103,14 @@ final class BitArray {
       return true; // already seen
     } else {
       data_[offset] |= mask;
-      ++numBitsSet_;
+      if (numBitsSet_ != -1) { ++numBitsSet_; }
       return false; // new set
     }
   }
 
+  // may need to recompute value:
+  // O(1) if only getAndSetBit() has been used
+  // O(data_.length) if setBit() has ever been used
   long getNumBitsSet() {
     if (numBitsSet_ == -1) {
       numBitsSet_ = 0;
@@ -177,6 +188,7 @@ final class BitArray {
     }
   }
 
+  // prints the raw BitArray as 0s and 1s, one long per row
   @Override
   public String toString() {
     final StringBuilder sb = new StringBuilder();
@@ -188,7 +200,8 @@ final class BitArray {
     return sb.toString();
   }
 
-  String printLong(final long val) {
+  // prints a long as a series of 0s and 1s as little endian
+  private String printLong(final long val) {
     final StringBuilder sb = new StringBuilder();
     for (int j = 0; j < Long.SIZE; ++j) {
       sb.append((val & (1L << j)) != 0 ? "1" : "0");
@@ -197,6 +210,7 @@ final class BitArray {
     return sb.toString();
   }
 
+  // clears the array
   void reset() {
     final int numLongs = data_.length;
     data_ = new long[numLongs];
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index b576ad9f..d7919de5 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import static org.apache.datasketches.common.Util.LS;
+
 import java.nio.charset.StandardCharsets;
 import java.util.concurrent.ThreadLocalRandom;
 
@@ -197,6 +199,12 @@ public final class BloomFilter {
 
   /**
    * Updates the filter with the provided String.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
+   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
+   * </p>
+   *
    * @param item an item with which to update the filter
    */
   public void update(final String item) {
@@ -284,12 +292,26 @@ public final class BloomFilter {
   }
 
   // QUERY-AND-UPDATE METHODS 
+
+  /**
+   * Updates the filter with the provided long and 
+   * returns the result from quering that value prior to the update.
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update
+   */
   public boolean queryAndUpdate(final long item) {
     final long h0 = XxHash.hashLong(item, seed_);
     final long h1 = XxHash.hashLong(item, h0);
     return queryAndUpdateInternal(h0, h1);
   }
   
+  /**
+   * Updates the filter with the provided double and 
+   * returns the result from quering that value prior to the update.
+   * The double is canonicalized (NaN and +/- infinity) in the call.
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update
+   */
   public boolean queryAndUpdate(final double item) {
     // canonicalize all NaN & +/- infinity forms    
     final long[] data = { Double.doubleToLongBits(item) };
@@ -298,6 +320,18 @@ public final class BloomFilter {
     return queryAndUpdateInternal(h0, h1);
   }
 
+  /**
+   * Updates the filter with the provided String and 
+   * returns the result from quering that value prior to the update.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link #queryAndUpdate(char[])}
+   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update, or false if item is null
+   */
   public boolean queryAndUpdate(final String item) {
     if (item == null || item.isEmpty()) { return false; }
     final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
@@ -306,47 +340,84 @@ public final class BloomFilter {
     return queryAndUpdateInternal(h0, h1);
   }
 
+  /**
+   * Updates the filter with the provided byte[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
   public boolean queryAndUpdate(final byte[] data) {
     final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
     return queryAndUpdateInternal(h0, h1);
   }
   
-  public void queryAndUpdate(final char[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided char[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final char[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final short[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided short[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final short[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final int[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided int[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final int[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final long[] data) {
-    if (data == null) { return; }
+  /**
+   * Updates the filter with the provided long[] and 
+   * returns the result from quering that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data is null
+   */
+  public boolean queryAndUpdate(final long[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
-  public void queryAndUpdate(final Memory mem) {
-    if (mem == null) { return; }
+  /**
+   * Updates the filter with the provided Memory and 
+   * returns the result from quering that Memory prior to the update.
+   * @param mem an array with which to update the filter
+   * @return The query result prior to applying the update, or false if mem is null
+   */
+  public boolean queryAndUpdate(final Memory mem) {
+    if (mem == null) { return false; }
     final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
     final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
-    queryAndUpdateInternal(h0, h1);
+    return queryAndUpdateInternal(h0, h1);
   }
 
+  // Internal query-and-update method given pre-computed hashes
   private boolean queryAndUpdateInternal(final long h0, final long h1) {
     final long numBits = bitArray_.getCapacity();
     boolean valueAlreadyExists = true;
@@ -359,12 +430,29 @@ public final class BloomFilter {
   }
 
   // QUERY METHODS
+  /**
+   * Queries the filter with the provided long and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item
+   */
   public boolean query(final long item) {
     final long h0 = XxHash.hashLong(item, seed_);
     final long h1 = XxHash.hashLong(item, h0);
     return queryInternal(h0, h1);
   }
 
+  /**
+   * Queries the filter with the provided double and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible. Double values are
+   * canonicalized (NaN and +/- infinity) before querying.
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item
+   */
   public boolean query(final double item) {
     // canonicalize all NaN & +/- infinity forms    
     final long[] data = { Double.doubleToLongBits(item) };
@@ -373,6 +461,20 @@ public final class BloomFilter {
     return queryInternal(h0, h1);
   }
 
+  /**
+   * Queries the filter with the provided double and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
+   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item, or false if item is null
+   */
   public boolean query(final String item) {
     if (item == null || item.isEmpty()) { return false; }    
     final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
@@ -381,47 +483,97 @@ public final class BloomFilter {
     return queryInternal(h0, h1);
   }
 
+  /**
+   * Queries the filter with the provided byte[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
   public boolean query(final byte[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
     return queryInternal(h0, h1);
   }
 
-  public void query(final char[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided char[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final char[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final short[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided short[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final short[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final int[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided int[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final int[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final long[] data) {
-    if (data == null) { return; }
+  /**
+   * Queries the filter with the provided long[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false if data is null
+   */
+  public boolean query(final long[] data) {
+    if (data == null) { return false; }
     final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
     final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
-  public void query(final Memory mem) {
-    if (mem == null) { return; }
+  /**
+   * Queries the filter with the provided Memory and returns whether the
+   * data <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param mem a Memory array with which to query the filter
+   * @return The result of querying the filter with the given Memory, or false if data is null
+   */
+  public boolean query(final Memory mem) {
+    if (mem == null) { return false; }
     final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
     final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
-    queryInternal(h0, h1);
+    return queryInternal(h0, h1);
   }
 
+  // Internal method to query the filter given pre-computed hashes
   private boolean queryInternal(final long h0, final long h1) {
     final long numBits = bitArray_.getCapacity();
     for (int i = 1; i <= numHashes_; ++i) {
@@ -435,7 +587,13 @@ public final class BloomFilter {
   }
 
   // OTHER OPERATIONS
+  /**
+   * Unions two BloomFilters by applying a logical OR. The result will recognized
+   * any values seen by either filter (as well as false positives).
+   * @param other A BloomFilter to union with this one
+   */
   public void union(final BloomFilter other) {
+    if (other == null) { return; }
     if (!isCompatible(other)) {
       throw new SketchesArgumentException("Cannot union sketches with different seeds, hash functions, or sizes");
     }
@@ -443,7 +601,13 @@ public final class BloomFilter {
     bitArray_.union(other.bitArray_);
   }
 
+  /**
+   * Intersects two BloomFilters by applying a logical AND. The result will recognize
+   * only values seen by both filters (as well as false positives).
+   * @param other A BloomFilter to union with this one
+   */
   public void intersect(final BloomFilter other) {
+    if (other == null) { return; }
     if (!isCompatible(other)) {
       throw new SketchesArgumentException("Cannot union sketches with different seeds, hash functions, or sizes");
     }
@@ -451,12 +615,21 @@ public final class BloomFilter {
     bitArray_.intersect(other.bitArray_);
   }
 
+  /**
+   * Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.
+   */
   public void invert() {
     bitArray_.invert();
   }
 
+  /**
+   * Helps identify if two BloomFilters may be unioned or intersected.
+   * @param other A BloomFilter to check for compatibility with this one
+   * @return True if the filters are compatible, otherwise false
+   */
   public boolean isCompatible(final BloomFilter other) {
-    if (seed_ != other.seed_
+    if (other == null
+        || seed_ != other.seed_
         || numHashes_ != other.numHashes_
         || bitArray_.getArrayLength() != other.bitArray_.getArrayLength()) {
           return false;
@@ -464,16 +637,26 @@ public final class BloomFilter {
     return true;
   }
 
+  /**
+   * Returns the length of this BloomFilter when serialized, in bytes
+   * @return The length of this BloomFilter when serialized, in bytes
+   */
   public long getSerializedSizeBytes() {
     long sizeBytes = 2 * Long.BYTES; // basic sketch info + baseSeed
     sizeBytes += bitArray_.getSerializedSizeBytes();
     return sizeBytes;
   }
 
+  /**
+   * Serializes the current BloomFilter to an array of bytes.
+   *
+   * <p>Note: Method throws if the serialized size exceeds <tt>Integer.MAX_VALUE</tt>.</p>
+   * @return A serialized image of the current BloomFilter as byte[]
+   */
   public byte[] toByteArray() {
     final long sizeBytes = getSerializedSizeBytes();
     if (sizeBytes > Integer.MAX_VALUE) {
-      throw new SketchesStateException("Cannot serialize a Bloom Filter of this size using toByteArray(); use toLongArray() instead.");
+      throw new SketchesStateException("Cannot serialize a BloomFilter of this size using toByteArray(); use toLongArray() instead.");
     }
 
     final byte[] bytes = new byte[(int) sizeBytes];
@@ -494,6 +677,12 @@ public final class BloomFilter {
     return bytes;
   }
 
+  /**
+   * Serializes the current BloomFilter to an array of longs. Unlike {@link #toByteArray()},
+   * this method can handle any size filter.
+   *
+   * @return A serialized image of the current BloomFilter as long[]
+   */
   public long[] toLongArray() {
     final long sizeBytes = getSerializedSizeBytes();
 
@@ -515,14 +704,25 @@ public final class BloomFilter {
     return longs;
   }
 
-  private static void checkArgument(final boolean val, final String message) {
-    if (val) { throw new SketchesArgumentException(message); }
+  // Throws an exception with the provided message if the given condition is false
+  private static void checkArgument(final boolean condition, final String message) {
+    if (condition) { throw new SketchesArgumentException(message); }
   }
 
   @Override
   public String toString() {
     final StringBuilder sb = new StringBuilder();
-    sb.append(bitArray_.toString());
+
+    sb.append(LS);
+    final String thisSimpleName = this.getClass().getSimpleName();
+    sb.append("### ").append(thisSimpleName).append(" SUMMARY: ").append(LS);
+    sb.append("   numBits      : ").append(bitArray_.getCapacity()).append(LS);
+    sb.append("   numHashes    : ").append(numHashes_).append(LS);
+    sb.append("   seed         : ").append(seed_).append(LS);
+    sb.append("   bitsUsed     : ").append(bitArray_.getNumBitsSet()).append(LS);
+    sb.append("   fill %       : ").append(getFillPercentage()).append(LS);
+    sb.append("### END SKETCH SUMMARY").append(LS);
+
     return sb.toString();
   }
 }
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java
index 319f7777..2fa146e2 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BitArrayTest.java
@@ -28,7 +28,7 @@ import org.apache.datasketches.memory.WritableMemory;
 import org.testng.annotations.Test;
 
 public class BitArrayTest {
-  
+
   @Test
   public void createBitArrayTest() {
     final BitArray ba = new BitArray(119);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org