You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:52 UTC

(datasketches-java) 05/06: Finish main filter tests, add preamble format

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit f1aadaa89ee0d98c7181074b4680dfc63cfc50ee
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 20:50:23 2024 -0800

    Finish main filter tests, add preamble format
---
 .../filters/bloomfilter/BloomFilter.java           |  21 +++-
 .../filters/bloomfilter/BloomFilterTest.java       | 131 ++++++++++++++++++++-
 2 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index d7919de5..5df2777c 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -647,10 +647,29 @@ public final class BloomFilter {
     return sizeBytes;
   }
 
+/*
+ * A Bloom Filter's serialized image always uses 3 longs of preamble, whether empty or not:
+ *
+ * <pre>
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
+ *  0   || Preamble_Longs | SerVer | FamID  |  Flags |----Num Hashes---|-----Unused------|
+ *
+ *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |
+ *  1   ||---------------------------------Hash Seed-------------------------------------|
+ *
+ *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
+ *  2   ||-------BitArray Length (in longs)----------|-----------Unused------------------|
+ *  </pre>
+ * 
+ * The raw BitArray bits, if non-empty start at byte 24.
+ */
+
   /**
    * Serializes the current BloomFilter to an array of bytes.
    *
-   * <p>Note: Method throws if the serialized size exceeds <tt>Integer.MAX_VALUE</tt>.</p>
+   * <p>Note: Method throws if the serialized size exceeds <code>Integer.MAX_VALUE</code>.</p>
    * @return A serialized image of the current BloomFilter as byte[]
    */
   public byte[] toByteArray() {
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
index 564fbdab..5c0df995 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
@@ -22,6 +22,7 @@ package org.apache.datasketches.filters.bloomfilter;
 import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.fail;
 
 import org.apache.datasketches.common.SketchesArgumentException;
 import org.apache.datasketches.memory.Memory;
@@ -101,6 +102,40 @@ public class BloomFilterTest {
 
   }
 
+  @Test
+  public void incompatibleSetOperationsTest() {
+    final int numBits = 128;
+    final int numHashes = 4;
+    final BloomFilter bf1 = new BloomFilter(numBits, numHashes);
+
+    // mismatched num bits
+    try {
+      final BloomFilter bf2 = new BloomFilter(numBits * 2, numHashes, bf1.getSeed());
+      bf1.union(bf2);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // mismatched num hashes
+    try {
+      final BloomFilter bf2 = new BloomFilter(numBits, numHashes * 2, bf1.getSeed());
+      bf1.intersect(bf2);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+
+    // mismatched seed
+    try {
+      final BloomFilter bf2 = new BloomFilter(numBits, numHashes, bf1.getSeed() - 1);
+      bf1.union(bf2);
+      fail();
+    } catch (final SketchesArgumentException e) {
+      // expected
+    }
+  }
+
   @Test
   public void basicUnionTest() {
     final long numBits = 12288;
@@ -116,6 +151,7 @@ public class BloomFilterTest {
       bf2.queryAndUpdate(n / 2 + i);
     }
     
+    bf1.union(null); // no-op
     bf1.union(bf2);
     for (int i = 0; i < maxItem; ++i) {
       assertTrue(bf1.query(i));
@@ -144,6 +180,7 @@ public class BloomFilterTest {
       bf2.queryAndUpdate(n / 2 + i);
     }
     
+    bf1.intersect(null); // no-op
     bf1.intersect(bf2);
     for (int i = n / 2; i < n; ++i) {
       assertTrue(bf1.query(i));
@@ -229,5 +266,97 @@ public class BloomFilterTest {
     }
     assertEquals(fromLongsCount, n + count); // same numbers of items should match
   }
-}
 
+  @Test
+  public void testBasicUpdateMethods() {
+    final int numDistinct = 100;
+    final double fpp = 1e-6;
+    final BloomFilter bf = BloomFilterBuilder.newBloomFilter(numDistinct, fpp);
+
+    // empty/null String should do nothing
+    bf.update("");
+    bf.update((String) null);
+    assertFalse(bf.queryAndUpdate(""));
+    assertFalse(bf.queryAndUpdate((String) null));
+    assertEquals(bf.getBitsUsed(), 0);
+
+    bf.update("abc");
+    assertFalse(bf.queryAndUpdate("def"));
+    bf.update(932);
+    assertFalse(bf.queryAndUpdate(543));
+    bf.update(Double.NaN);
+    assertFalse(bf.queryAndUpdate(Double.POSITIVE_INFINITY));
+    assertTrue(bf.getBitsUsed() <= bf.getNumHashes() * 6);
+    assertFalse(bf.isEmpty());
+  }
+
+  @Test
+  public void testArrayUpdateMethods() {
+    // 3 doubles = 24 bytes
+    final double rawData[] = { 1.414, 2.71, 3.1415926538 };
+    final Memory mem = Memory.wrap(rawData);
+
+    final int numDistinct = 100;
+    final double fpp = 1e-6;
+
+    // for each BloomFilter update type, call update() then queryAndUpdate(), where
+    // the latter should return true. query() should likewise return true.
+    // A final intersection should have the same number of bits set as the raw input.
+    final BloomFilter bfMem = BloomFilterBuilder.newBloomFilter(numDistinct, fpp);
+    bfMem.update(mem);
+    assertTrue(bfMem.queryAndUpdate(mem));
+    assertTrue(bfMem.query(mem));
+    final long numBitsSet = bfMem.getBitsUsed();
+    final long seed = bfMem.getSeed();
+
+    final BloomFilter bfBytes = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final byte[] bytes = new byte[24];
+    mem.getByteArray(0, bytes, 0, 24);
+    bfBytes.update(bytes);
+    assertTrue(bfBytes.queryAndUpdate(bytes));
+    assertTrue(bfBytes.query(bytes));
+    assertEquals(bfBytes.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfChars = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final char[] chars = new char[12];
+    mem.getCharArray(0, chars, 0, 12);
+    bfChars.update(chars);
+    assertTrue(bfChars.queryAndUpdate(chars));
+    assertTrue(bfChars.query(chars));
+    assertEquals(bfChars.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfShorts = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final short[] shorts = new short[12];
+    mem.getShortArray(0, shorts, 0, 12);
+    bfShorts.update(shorts);
+    assertTrue(bfShorts.queryAndUpdate(shorts));
+    assertTrue(bfShorts.query(shorts));
+    assertEquals(bfShorts.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfInts = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final int[] ints = new int[6];
+    mem.getIntArray(0, ints, 0, 6);
+    bfInts.update(ints);
+    assertTrue(bfInts.queryAndUpdate(ints));
+    assertTrue(bfInts.query(ints));
+    assertEquals(bfInts.getBitsUsed(), numBitsSet);
+
+    final BloomFilter bfLongs = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    final long[] longs = new long[3];
+    mem.getLongArray(0, longs, 0, 3);
+    bfLongs.update(longs);
+    assertTrue(bfLongs.queryAndUpdate(longs));
+    assertTrue(bfLongs.query(longs));
+    assertEquals(bfLongs.getBitsUsed(), numBitsSet);
+
+    // intersect all the sketches into a new one
+    final BloomFilter bf = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+    bf.intersect(bfMem);
+    bf.intersect(bfBytes);
+    bf.intersect(bfChars);
+    bf.intersect(bfShorts);
+    bf.intersect(bfInts);
+    bf.intersect(bfLongs);
+    assertEquals(bfLongs.getBitsUsed(), numBitsSet);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org