You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by jm...@apache.org on 2024/02/27 06:53:52 UTC
(datasketches-java) 05/06: Finish main filter tests, add preamble format
This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch bloom
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit f1aadaa89ee0d98c7181074b4680dfc63cfc50ee
Author: jmalkin <78...@users.noreply.github.com>
AuthorDate: Mon Feb 26 20:50:23 2024 -0800
Finish main filter tests, add preamble format
---
.../filters/bloomfilter/BloomFilter.java | 21 +++-
.../filters/bloomfilter/BloomFilterTest.java | 131 ++++++++++++++++++++-
2 files changed, 150 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index d7919de5..5df2777c 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -647,10 +647,29 @@ public final class BloomFilter {
return sizeBytes;
}
+/*
+ * A Bloom Filter's serialized image always uses 3 longs of preamble, whether empty or not:
+ *
+ * <pre>
+ * Long || Start Byte Adr:
+ * Adr:
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |----Num Hashes---|-----Unused------|
+ *
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
+ * 1 ||---------------------------------Hash Seed-------------------------------------|
+ *
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
+ * 2 ||-------BitArray Length (in longs)----------|-----------Unused------------------|
+ * </pre>
+ *
+ * The raw BitArray bits, if non-empty start at byte 24.
+ */
+
/**
* Serializes the current BloomFilter to an array of bytes.
*
- * <p>Note: Method throws if the serialized size exceeds <tt>Integer.MAX_VALUE</tt>.</p>
+ * <p>Note: Method throws if the serialized size exceeds <code>Integer.MAX_VALUE</code>.</p>
* @return A serialized image of the current BloomFilter as byte[]
*/
public byte[] toByteArray() {
diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
index 564fbdab..5c0df995 100644
--- a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
+++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterTest.java
@@ -22,6 +22,7 @@ package org.apache.datasketches.filters.bloomfilter;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.fail;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
@@ -101,6 +102,40 @@ public class BloomFilterTest {
}
+ @Test
+ public void incompatibleSetOperationsTest() {
+ final int numBits = 128;
+ final int numHashes = 4;
+ final BloomFilter bf1 = new BloomFilter(numBits, numHashes);
+
+ // mismatched num bits
+ try {
+ final BloomFilter bf2 = new BloomFilter(numBits * 2, numHashes, bf1.getSeed());
+ bf1.union(bf2);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // mismatched num hashes
+ try {
+ final BloomFilter bf2 = new BloomFilter(numBits, numHashes * 2, bf1.getSeed());
+ bf1.intersect(bf2);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+
+ // mismatched seed
+ try {
+ final BloomFilter bf2 = new BloomFilter(numBits, numHashes, bf1.getSeed() - 1);
+ bf1.union(bf2);
+ fail();
+ } catch (final SketchesArgumentException e) {
+ // expected
+ }
+ }
+
@Test
public void basicUnionTest() {
final long numBits = 12288;
@@ -116,6 +151,7 @@ public class BloomFilterTest {
bf2.queryAndUpdate(n / 2 + i);
}
+ bf1.union(null); // no-op
bf1.union(bf2);
for (int i = 0; i < maxItem; ++i) {
assertTrue(bf1.query(i));
@@ -144,6 +180,7 @@ public class BloomFilterTest {
bf2.queryAndUpdate(n / 2 + i);
}
+ bf1.intersect(null); // no-op
bf1.intersect(bf2);
for (int i = n / 2; i < n; ++i) {
assertTrue(bf1.query(i));
@@ -229,5 +266,97 @@ public class BloomFilterTest {
}
assertEquals(fromLongsCount, n + count); // same numbers of items should match
}
-}
+ @Test
+ public void testBasicUpdateMethods() {
+ final int numDistinct = 100;
+ final double fpp = 1e-6;
+ final BloomFilter bf = BloomFilterBuilder.newBloomFilter(numDistinct, fpp);
+
+ // empty/null String should do nothing
+ bf.update("");
+ bf.update((String) null);
+ assertFalse(bf.queryAndUpdate(""));
+ assertFalse(bf.queryAndUpdate((String) null));
+ assertEquals(bf.getBitsUsed(), 0);
+
+ bf.update("abc");
+ assertFalse(bf.queryAndUpdate("def"));
+ bf.update(932);
+ assertFalse(bf.queryAndUpdate(543));
+ bf.update(Double.NaN);
+ assertFalse(bf.queryAndUpdate(Double.POSITIVE_INFINITY));
+ assertTrue(bf.getBitsUsed() <= bf.getNumHashes() * 6);
+ assertFalse(bf.isEmpty());
+ }
+
+ @Test
+ public void testArrayUpdateMethods() {
+ // 3 doubles = 24 bytes
+ final double rawData[] = { 1.414, 2.71, 3.1415926538 };
+ final Memory mem = Memory.wrap(rawData);
+
+ final int numDistinct = 100;
+ final double fpp = 1e-6;
+
+ // for each BloomFilter update type, call update() then queryAndUpdate(), where
+ // the latter should return true. query() should likewise return true.
+ // A final intersection should have the same number of bits set as the raw input.
+ final BloomFilter bfMem = BloomFilterBuilder.newBloomFilter(numDistinct, fpp);
+ bfMem.update(mem);
+ assertTrue(bfMem.queryAndUpdate(mem));
+ assertTrue(bfMem.query(mem));
+ final long numBitsSet = bfMem.getBitsUsed();
+ final long seed = bfMem.getSeed();
+
+ final BloomFilter bfBytes = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+ final byte[] bytes = new byte[24];
+ mem.getByteArray(0, bytes, 0, 24);
+ bfBytes.update(bytes);
+ assertTrue(bfBytes.queryAndUpdate(bytes));
+ assertTrue(bfBytes.query(bytes));
+ assertEquals(bfBytes.getBitsUsed(), numBitsSet);
+
+ final BloomFilter bfChars = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+ final char[] chars = new char[12];
+ mem.getCharArray(0, chars, 0, 12);
+ bfChars.update(chars);
+ assertTrue(bfChars.queryAndUpdate(chars));
+ assertTrue(bfChars.query(chars));
+ assertEquals(bfChars.getBitsUsed(), numBitsSet);
+
+ final BloomFilter bfShorts = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+ final short[] shorts = new short[12];
+ mem.getShortArray(0, shorts, 0, 12);
+ bfShorts.update(shorts);
+ assertTrue(bfShorts.queryAndUpdate(shorts));
+ assertTrue(bfShorts.query(shorts));
+ assertEquals(bfShorts.getBitsUsed(), numBitsSet);
+
+ final BloomFilter bfInts = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+ final int[] ints = new int[6];
+ mem.getIntArray(0, ints, 0, 6);
+ bfInts.update(ints);
+ assertTrue(bfInts.queryAndUpdate(ints));
+ assertTrue(bfInts.query(ints));
+ assertEquals(bfInts.getBitsUsed(), numBitsSet);
+
+ final BloomFilter bfLongs = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+ final long[] longs = new long[3];
+ mem.getLongArray(0, longs, 0, 3);
+ bfLongs.update(longs);
+ assertTrue(bfLongs.queryAndUpdate(longs));
+ assertTrue(bfLongs.query(longs));
+ assertEquals(bfLongs.getBitsUsed(), numBitsSet);
+
+ // intersect all the sketches into a new one
+ final BloomFilter bf = BloomFilterBuilder.newBloomFilter(numDistinct, fpp, seed);
+ bf.intersect(bfMem);
+ bf.intersect(bfBytes);
+ bf.intersect(bfChars);
+ bf.intersect(bfShorts);
+ bf.intersect(bfInts);
+ bf.intersect(bfLongs);
+ assertEquals(bfLongs.getBitsUsed(), numBitsSet);
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org