You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/29 05:14:19 UTC

(datasketches-java) branch bloom_review_changes created (now c8993166)

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a change to branch bloom_review_changes
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git


      at c8993166 Changes mostly based on review feedback, but without modifying the review branch since changes are small and scattered

This branch includes the following new commits:

     new c8993166 Changes mostly based on review feedback, but without modifying the review branch since changes are small and scattered

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-java) 01/01: Changes mostly based on review feedback, but without modifying the review branch since changes are small and scattered

Posted by jm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom_review_changes
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit c899316611332de91267f51d3e8313f792ebaf25
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Wed Feb 28 21:13:57 2024 -0800

    Changes mostly based on review feedback, but without modifying the review branch since changes are small and scattered
---
 .../org/apache/datasketches/common/Family.java     |  2 +-
 .../datasketches/filters/bloomfilter/BitArray.java | 49 +++++++++++++---------
 .../filters/bloomfilter/BloomFilter.java           | 14 ++++++-
 .../filters/bloomfilter/BloomFilterBuilder.java    |  8 ++++
 4 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/common/Family.java b/src/main/java/org/apache/datasketches/common/Family.java
index a8a94b4c..38c517c1 100644
--- a/src/main/java/org/apache/datasketches/common/Family.java
+++ b/src/main/java/org/apache/datasketches/common/Family.java
@@ -156,7 +156,7 @@ public enum Family {
   /**
    * Bloom Filter
    */
-  BLOOMFILTER(21, "BLOOMFILTER", 3, 3);
+  BLOOMFILTER(21, "BLOOMFILTER", 4, 4);
   ;
 
   private static final Map<Integer, Family> lookupID = new HashMap<>();
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
index 9d2242ee..dd8943fe 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import java.util.Arrays;
+
 import org.apache.datasketches.common.SketchesArgumentException;
 import org.apache.datasketches.common.SketchesStateException;
 import org.apache.datasketches.memory.Memory;
@@ -33,9 +35,11 @@ import org.apache.datasketches.memory.WritableMemory;
 final class BitArray {
   // MAX_BITS using longs, based on array indices being capped at Integer.MAX_VALUE
   private static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; 
-  private static final long DATA_OFFSET = 8; // offset into memory for start of data array
+  private static final long NUMBITSSET_OFFSET = 8; // offset into memory for numBitsSet
+  private static final long DATA_OFFSET = 16; // offset into memory for start of data array
 
   private long numBitsSet_;  // if -1, need to recompute value
+  private boolean isDirty_;
   private long[] data_;
 
   // creates an array of a given size
@@ -49,16 +53,15 @@ final class BitArray {
 
     final int numLongs = (int) Math.ceil(numBits / 64.0);
     numBitsSet_ = 0;
+    isDirty_ = false;
     data_ = new long[numLongs];
   }
 
   // uses the provided array
-  BitArray(final long[] data) {
+  BitArray(final long numBitsSet, final long[] data) {
     data_ = data;
-    numBitsSet_ = 0;
-    for (long val : data) {
-      numBitsSet_ += Long.bitCount(val);
-    }
+    isDirty_ = numBitsSet < 0;
+    numBitsSet_ = numBitsSet;
   }
 
   // reads a serialized image, but the BitArray is not fully self-describing so requires
@@ -71,15 +74,18 @@ final class BitArray {
     
     if (isEmpty) {
       return new BitArray((long) numLongs * Long.SIZE);
-    }
+    }    
+
+    // will be -1 if dirty
+    final long numBitsSet = mem.getLong(NUMBITSSET_OFFSET);
 
     final long[] data = new long[numLongs];
     mem.getLongArray(DATA_OFFSET, data, 0, numLongs);
-    return new BitArray(data);
+    return new BitArray(numBitsSet, data);
   }
 
   boolean isEmpty() {
-    return numBitsSet_ == 0;
+    return getNumBitsSet() == 0;
   }
 
   // queries a single bit in the array
@@ -88,11 +94,10 @@ final class BitArray {
   }
 
   // sets a single bit in the array without querying, meaning the method
-  // cannot properly track the number of bits set. 
-  // instead changes numBitsSet_ to indicate that the value is unreliable
+  // cannot properly track the number of bits set so set isDirty = true
   void setBit(final long index) {
     data_[(int) index >>> 6] |= 1L << index;
-    numBitsSet_ = -1; // use as a dirty flag
+    isDirty_ = true;
   }
 
   // returns existing value of bit
@@ -103,7 +108,7 @@ final class BitArray {
       return true; // already seen
     } else {
       data_[offset] |= mask;
-      if (numBitsSet_ != -1) { ++numBitsSet_; }
+      ++numBitsSet_; // increment regardless of isDirty_
       return false; // new set
     }
   }
@@ -112,7 +117,7 @@ final class BitArray {
   // O(1) if only getAndSetBit() has been used
   // O(data_.length) if setBit() has ever been used
   long getNumBitsSet() {
-    if (numBitsSet_ == -1) {
+    if (isDirty_) {
       numBitsSet_ = 0;
       for (long val : data_) {
         numBitsSet_ += Long.bitCount(val);
@@ -137,6 +142,7 @@ final class BitArray {
       numBitsSet += Long.bitCount(data_[i]);
     }
     numBitsSet_ = numBitsSet;
+    isDirty_ = false;
   }
 
   // applies logical AND
@@ -151,29 +157,31 @@ final class BitArray {
       numBitsSet += Long.bitCount(data_[i]);
     }
     numBitsSet_ = numBitsSet;
+    isDirty_ = false;
   }
 
   // applies bitwise inversion
   void invert() {
-    if (numBitsSet_ == -1) {
+    if (isDirty_) {
       numBitsSet_ = 0;
       for (int i = 0; i < data_.length; ++i) {
         data_[i] = ~data_[i];
         numBitsSet_ += Long.bitCount(data_[i]);
       }
+      isDirty_ = false;
     } else {
       for (int i = 0; i < data_.length; ++i) {
         data_[i] = ~data_[i];
       }
-      numBitsSet_ = getCapacity() - numBitsSet_;  
+      numBitsSet_ = getCapacity() - numBitsSet_;
     }
   }
 
   long getSerializedSizeBytes() {
     // We only really need an int for array length but this will keep everything
     // aligned to 8 bytes.
-    // Always write array length, even if empty
-    return isEmpty() ? Long.BYTES : Long.BYTES * (1L + data_.length);
+    // Always write array length and numBitsSet, even if empty
+    return isEmpty() ? Long.BYTES : Long.BYTES * (2L + data_.length);
   }
 
   void writeToMemory(final WritableMemory wmem) {
@@ -184,6 +192,7 @@ final class BitArray {
     wmem.putInt(0, data_.length);
     
     if (!isEmpty()) {
+      wmem.putLong(NUMBITSSET_OFFSET, isDirty_ ? -1 : numBitsSet_);
       wmem.putLongArray(DATA_OFFSET, data_, 0, data_.length);
     }
   }
@@ -212,8 +221,8 @@ final class BitArray {
 
   // clears the array
   void reset() {
-    final int numLongs = data_.length;
-    data_ = new long[numLongs];
+    Arrays.fill(data_, 0);
     numBitsSet_ = 0;
+    isDirty_ = false;
   }
 }
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index 00c6dfe9..33df6e55 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -57,7 +57,7 @@ import org.apache.datasketches.memory.XxHash;
  */
 public final class BloomFilter {
   // maximum number of longs in the array with space for a header at serialization
-  private static final long MAX_SIZE = (Integer.MAX_VALUE - 3) * (long) Long.SIZE;
+  private static final long MAX_SIZE = (Integer.MAX_VALUE - Family.BLOOMFILTER.getMaxPreLongs()) * (long) Long.SIZE;
   private static final int SER_VER = 1;
   private static final int EMPTY_FLAG_MASK = 4;
 
@@ -135,6 +135,13 @@ public final class BloomFilter {
     return new BloomFilter(numHashes, seed, bitArray);
   }
 
+  /**
+   * Resets the BloomFilter to an empty state
+   */
+  public void reset() {
+    bitArray_.reset();
+  }
+
   /**
    * Checks if the BloomFilter has processed any items
    * @return True if the BloomFilter is empty, otherwise False
@@ -648,7 +655,7 @@ public final class BloomFilter {
   }
 
 /*
- * A Bloom Filter's serialized image always uses 3 longs of preamble, whether empty or not:
+ * A Bloom Filter's serialized image always uses 4 longs of preamble, whether empty or not:
  *
  * <pre>
  * Long || Start Byte Adr:
@@ -661,6 +668,9 @@ public final class BloomFilter {
  *
  *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
  *  2   ||-------BitArray Length (in longs)----------|-----------Unused------------------|
+ *
+ *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30   |   31   |
+ *  2   ||---------------------------------NumBitsSet------------------------------------|
  *  </pre>
  * 
  * The raw BitArray bits, if non-empty start at byte 24.
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 0934338d..22356330 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -23,6 +23,14 @@ import java.util.concurrent.ThreadLocalRandom;
 
 import org.apache.datasketches.common.SketchesArgumentException;
 
+/**
+ * <p>This class provides methods to help estimate the correct paramters to use when
+ * creating a Bloom filter, and methods to create the filter using those values.</p>
+ * 
+ * <p>The underlying math is described in the
+ * <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'>
+ * Wikipedia article on Bloom filters</a>.</p>
+ */
 public final class BloomFilterBuilder {
 
   /**


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org