You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2022/11/30 16:30:37 UTC

[commons-collections] branch master updated: Collections-817: Update estimateN, estimateIntersection and estimateUnion

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git


The following commit(s) were added to refs/heads/master by this push:
     new a1de78717 Collections-817: Update estimateN, estimateIntersection and estimateUnion
a1de78717 is described below

commit a1de787172664a63fd172571fc9779137568ece4
Author: aherbert <ah...@apache.org>
AuthorDate: Wed Nov 30 16:30:15 2022 +0000

    Collections-817: Update estimateN, estimateIntersection and
    estimateUnion
    
    Reworked calculations and updated javadoc.
    
    Added documentation concerning n, intersection, and union calculations
    in Shape class.
    
    Modified estimatedIntersection to account for infinite values in
    calculations.
    
    This commit addresses a merge error that omitted all fixes  intended for
    the changes in 64ed061e6a2a89f251ebd26dd52650d56025b5c1
---
 .../collections4/bloomfilter/BloomFilter.java      | 75 +++++++++++++++----
 .../commons/collections4/bloomfilter/Shape.java    | 42 +++++++++++
 .../bloomfilter/SimpleBloomFilter.java             |  4 --
 .../bloomfilter/AbstractBitMapProducerTest.java    | 12 ++++
 .../bloomfilter/AbstractBloomFilterTest.java       | 27 +++++--
 .../bloomfilter/DefaultBitMapProducerTest.java     | 18 +++++
 .../bloomfilter/DefaultBloomFilterTest.java        | 83 +++++++++++++++++++++-
 .../bloomfilter/SimpleBloomFilterTest.java         | 17 +++++
 8 files changed, 255 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java
index ec36d63c7..d256a927b 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java
@@ -150,6 +150,7 @@ public interface BloomFilter extends IndexProducer, BitMapProducer {
      *
      * @param hasher The hasher to merge.
      * @return true if the merge was successful
+     * @throws IllegalArgumentException if hasher produces an illegal value.
      */
     default boolean merge(final Hasher hasher) {
         Objects.requireNonNull(hasher, "hasher");
@@ -214,26 +215,48 @@ public interface BloomFilter extends IndexProducer, BitMapProducer {
      * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
      * shape and cardinality of this filter.</p>
      *
-     * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter.</p>
+     * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
+     * by rounding the value from the calculation described in the {@link Shape} class javadoc.</p>
      *
-     * @return an estimate of the number of items in the bloom filter.
+     * <p><em>Note:</em></p>
+     * <ul>
+     * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
+     * <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
+     * </ul>
+     *
+     * @return an estimate of the number of items in the bloom filter.  Will return Integer.MAX_VALUE if the
+     * estimate is larger than Integer.MAX_VALUE.
+     * @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
      * @see Shape#estimateN(int)
+     * @see Shape
      */
     default int estimateN() {
-        return (int) Math.round(getShape().estimateN(cardinality()));
+        double d = getShape().estimateN(cardinality());
+        if (Double.isInfinite(d)) {
+            return Integer.MAX_VALUE;
+        }
+        if (Double.isNaN(d)) {
+            throw new IllegalArgumentException("Cardinality too large: " + cardinality());
+        }
+        long l = Math.round(d);
+        return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
     }
 
     /**
      * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
      *
-     * <p>By default this is the {@code estimateN()} of the merging of this filter with the {@code other} filter.</p>
-     *
      * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
-     * of the filters.</p>
+     * of the filters by rounding the value from the calculation described in the {@link Shape} class javadoc.</p>
+     *
+     * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape.  If called on Bloom
+     * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
+     * may be thrown.</em></p>
      *
      * @param other The other Bloom filter
-     * @return an estimate of the number of items in the union.
+     * @return an estimate of the number of items in the union.  Will return Integer.MAX_VALUE if the
+     * estimate is larger than Integer.MAX_VALUE.
      * @see #estimateN()
+     * @see Shape
      */
     default int estimateUnion(final BloomFilter other) {
         Objects.requireNonNull(other, "other");
@@ -245,16 +268,44 @@ public interface BloomFilter extends IndexProducer, BitMapProducer {
     /**
      * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
      *
-     * <p>By default this is the {@code estimateN() + other.estimateN() - estimateUnion(other)} </p>
+     * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
+     * of the filters by rounding the value from the calculation described in the {@link Shape} class javadoc.</p>
      *
-     * <p>This produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
-     * of the filters.</p>
+     * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape.  If called on Bloom
+     * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
+     * may be thrown.</em></p>
      *
      * @param other The other Bloom filter
-     * @return an estimate of the number of items in the intersection.
+     * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
+     * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
+     * @see #estimateN()
+     * @see Shape
      */
     default int estimateIntersection(final BloomFilter other) {
         Objects.requireNonNull(other, "other");
-        return estimateN() + other.estimateN() - estimateUnion(other);
+        double eThis = getShape().estimateN(cardinality());
+        double eOther = getShape().estimateN(other.cardinality());
+        if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
+            // if both are infinite the union is infinite and we return Integer.MAX_VALUE
+            return Integer.MAX_VALUE;
+        }
+        long estimate;
+        // if one is infinite the intersection is the other.
+        if (Double.isInfinite(eThis)) {
+            estimate = Math.round(eOther);
+        } else if (Double.isInfinite(eOther)) {
+            estimate = Math.round(eThis);
+        } else {
+            BloomFilter union = this.copy();
+            union.merge(other);
+            double eUnion = getShape().estimateN(union.cardinality());
+            if (Double.isInfinite(eUnion)) {
+                throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
+            }
+            // maximum estimate value using integer values is: 46144189292 thus
+            // eThis + eOther can not overflow the long value.
+            estimate = Math.round(eThis + eOther - eUnion);
+        }
+        return estimate>Integer.MAX_VALUE?Integer.MAX_VALUE:(int) estimate;
     }
 }
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java
index 9b903e9ce..883208f06 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java
@@ -36,6 +36,48 @@ package org.apache.commons.collections4.bloomfilter;
  * <dd>{@code k = round((m / n) * ln(2))}</dd>
  * </dl>
  *
+ * <h2>Estimations from cardinality based on shape</h2>
+ *
+ * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p>
+ *
+ * <p>In the calculation below the following values are used:</p>
+ * <ul>
+ * <li>double c = the cardinality of the Bloom filter.</li>
+ * <li>double m = numberOfBits as specified in the shape.</li>
+ * <li>double k = numberOfHashFunctions as specified in the shape.</li>
+ * </ul>
+ *
+ * <h3>Estimate N - n()</h3>
+ *
+ * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}.  This is the calculation
+ * performed by the {@code Shape.estimateN(cardinality)} method below.  This estimate is roughly equivalent to the
+ * number of hashers that have been merged into a filter to create the cardinality specified.</p>
+ *
+ * <p><em>Note:</em></p>
+ * <ul>
+ * <li>if cardinality == numberOfBits, then result is infinity.</li>
+ * <li>if cardinality &gt; numberOfBits, then result is NaN.</li>
+ * </ul>
+ *
+ * <h3>Estimate N of Union - n(A &cup; B)</h3>
+ *
+ * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and
+ * calculate the estimated N from the result.</p>
+ *
+ * <h3>Estimate N of the Intersection - n(A &cap; B)</h3>
+ *
+ * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is:
+ * n(A) + n(b) - n(A &cup; B).</p>
+ *
+ * <p>Care must be taken when any of the n(x) returns infinity.  In general the following assumptions are true:
+ *
+ * <ul>
+ * <li>If n(A) = &infin; and n(B) &lt; &infin; then n(A &cap; B) = n(B)</li>
+ * <li>If n(A) &lt; &infin; and n(B) = &infin; then n(A &cap; B) = n(A)</li>
+ * <li>If n(A) = &infin; and n(B) = &infin; then n(A &cap; B) = &infin;</li>
+ * <li>If n(A) &lt; &infin; and n(B) &lt; &infin; and n(A &cup; B) = &infin; then n(A &cap; B) is undefined.</li>
+ * </ul>
+ *
  * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a>
  * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter
  * [Wikipedia]</a>
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java
index 6793d223b..25114c113 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java
@@ -114,10 +114,6 @@ public final class SimpleBloomFilter implements BloomFilter {
             // idx[0] will be limit+1 so decrement it
             idx[0]--;
             final int idxLimit = BitMap.getLongIndex(shape.getNumberOfBits());
-            if (idxLimit < idx[0]) {
-                throw new IllegalArgumentException(String.format(
-                        "BitMapProducer set a bit higher than the limit for the shape: %s", shape.getNumberOfBits() - 1));
-            }
             if (idxLimit == idx[0]) {
                 final long excess = bitMap[idxLimit] >> shape.getNumberOfBits();
                 if (excess != 0) {
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java
index 956712bbf..ec3832b75 100644
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitMapProducerTest.java
@@ -103,6 +103,18 @@ public abstract class AbstractBitMapProducerTest {
         Arrays.fill(count, 0);
         createProducer().forEachBitMapPair(createEmptyProducer(), lbp);
         assertEquals(count[2], count[1]);
+
+        // test where the created producer does not process all records because the predicate function
+        // returns false before the processing is completed.
+        int[] limit = new int[1];
+        final LongBiPredicate shortFunc =  (x, y) -> {
+            limit[0]++;
+            return limit[0] < 2;
+        };
+        final BitMapProducer shortProducer = l -> {
+            return true;
+        };
+        assertFalse(createProducer().forEachBitMapPair(shortProducer, shortFunc));
     }
 
     @Test
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java
index e525aca8b..7211779e4 100644
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java
@@ -215,7 +215,7 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
     }
 
     /**
-     * Tests that the andCardinality calculations are correct.
+     * Tests that the estimated intersection calculations are correct.
      */
     @Test
     public final void testEstimateIntersection() {
@@ -223,14 +223,27 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
         final BloomFilter bf = createFilter(getTestShape(), TestingHashers.FROM1);
         final BloomFilter bf2 = TestingHashers.populateFromHashersFrom1AndFrom11(createEmptyFilter(getTestShape()));
 
+        final BloomFilter bf3 = TestingHashers.populateEntireFilter(createEmptyFilter(getTestShape()));
 
         assertEquals(1, bf.estimateIntersection(bf2));
         assertEquals(1, bf2.estimateIntersection(bf));
+        assertEquals(1, bf.estimateIntersection(bf3));
+        assertEquals(1, bf2.estimateIntersection(bf));
+        assertEquals(2, bf3.estimateIntersection(bf2));
 
-        final BloomFilter bf3 = createEmptyFilter(getTestShape());
+        final BloomFilter bf4 = createEmptyFilter(getTestShape());
+
+        assertEquals(0, bf.estimateIntersection(bf4));
+        assertEquals(0, bf4.estimateIntersection(bf));
 
-        assertEquals(0, bf.estimateIntersection(bf3));
-        assertEquals(0, bf3.estimateIntersection(bf));
+        BloomFilter bf5 = TestingHashers.mergeHashers(createEmptyFilter(getTestShape()), new IncrementingHasher(0, 1)/* 0-16 */,
+                new IncrementingHasher(17, 1)/* 17-33 */, new IncrementingHasher(33, 1)/* 33-49 */);
+        BloomFilter bf6 = TestingHashers.mergeHashers(createEmptyFilter(getTestShape()), new IncrementingHasher(50, 1)/* 50-66 */,
+                new IncrementingHasher(67, 1)/* 67-83 */);
+        assertThrows(IllegalArgumentException.class, () -> bf5.estimateIntersection(bf6));
+
+        // infinite with infinite
+        assertEquals(Integer.MAX_VALUE, bf3.estimateIntersection(bf3));
     }
 
     /**
@@ -256,18 +269,20 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
     @Test
     public final void testEstimateN() {
         // build a filter
-        final BloomFilter filter1 = createFilter(getTestShape(), TestingHashers.FROM1);
+        BloomFilter filter1 = createFilter(getTestShape(), TestingHashers.FROM1);
         assertEquals(1, filter1.estimateN());
 
         // the data provided above do not generate an estimate that is equivalent to the
         // actual.
         filter1.merge(new IncrementingHasher(4, 1));
-
         assertEquals(1, filter1.estimateN());
 
         filter1.merge(new IncrementingHasher(17, 1));
 
         assertEquals(3, filter1.estimateN());
+
+        filter1 = TestingHashers.populateEntireFilter(createEmptyFilter(getTestShape()));
+        assertEquals(Integer.MAX_VALUE, filter1.estimateN());
     }
 
     /**
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java
index e2c54341f..d0bee54c7 100644
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitMapProducerTest.java
@@ -87,4 +87,22 @@ public class DefaultBitMapProducerTest extends AbstractBitMapProducerTest {
         final long[] ary = BitMapProducer.fromBitMapArray(expected).asBitMapArray();
         assertArrayEquals(expected, ary);
     }
+
+    @Test
+    public void testAsBitMapArrayLargeArray() {
+        final long[] expected = generateLongArray(32);
+        BitMapProducer producer = new BitMapProducer() {
+            @Override
+            public boolean forEachBitMap(LongPredicate predicate) {
+                for (long l : expected) {
+                    if (!predicate.test(l)) {
+                        return false;
+                    }
+                }
+                return true;
+            }
+        };
+        final long[] ary = producer.asBitMapArray();
+        assertArrayEquals(expected, ary);
+    }
 }
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java
index 7ccbda945..cb8e43225 100644
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java
@@ -18,6 +18,7 @@ package org.apache.commons.collections4.bloomfilter;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
 import java.util.TreeSet;
 import java.util.function.IntPredicate;
@@ -67,6 +68,74 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
                 .forEachBitMapPair(bf1, (x, y) -> x == y));
     }
 
+    @Test
+    public void testEstimateNWithBrokenCardinality() {
+        // build a filter
+        BloomFilter filter1 = TestingHashers.populateEntireFilter(new BrokenCardinality(getTestShape()));
+        assertThrows(IllegalArgumentException.class, () -> filter1.estimateN());
+    }
+
+    @Test
+    public void testEstimateLargeN() {
+        Shape s = Shape.fromKM(1, Integer.MAX_VALUE);
+        // create a very large filter with Integer.MAX_VALUE-1 bits set.
+        BloomFilter bf1 = new SimpleBloomFilter(s);
+        bf1.merge((BitMapProducer) predicate -> {
+            int limit = Integer.MAX_VALUE - 1;
+            while (limit > 64) {
+                predicate.test(0xFFFFFFFFFFFFFFFFL);
+                limit -= 64;
+            }
+            long last = 0L;
+            for (int i = 0; i < limit; i++) {
+                last |= BitMap.getLongBit(i);
+            }
+            predicate.test(last);
+            return true;
+        });
+        // the actual result of the calculation is: 46144189292, so the returned value
+        // should be Integer.MAX_VALUE.
+        assertEquals(Integer.MAX_VALUE, bf1.estimateN());
+    }
+
+    @Test
+    public void testIntersectionLimit() {
+        Shape s = Shape.fromKM(1, Integer.MAX_VALUE);
+        // create a very large filter with Integer.MAX_VALUE-1 bit set.
+        BloomFilter bf1 = new SimpleBloomFilter(s);
+        bf1.merge((BitMapProducer) predicate -> {
+            int limit = Integer.MAX_VALUE - 1;
+            while (limit > 64) {
+                predicate.test(0xFFFFFFFFFFFFFFFFL);
+                limit -= 64;
+            }
+            long last = 0L;
+            for (int i = 0; i < limit; i++) {
+                last |= BitMap.getLongBit(i);
+            }
+            predicate.test(last);
+            return true;
+        });
+        // the actual result of the calculation is: 46144189292
+        assertEquals(Integer.MAX_VALUE, bf1.estimateIntersection(bf1));
+    }
+
+    @Test
+    public void testSparseNonSparseMerging() {
+        BloomFilter bf1 = new SparseDefaultBloomFilter(getTestShape());
+        bf1.merge(TestingHashers.FROM1);
+        BloomFilter bf2 = new NonSparseDefaultBloomFilter(getTestShape());
+        bf2.merge(TestingHashers.FROM11);
+
+        BloomFilter result = bf1.copy();
+        result.merge(bf2);
+        assertEquals(27, result.cardinality());
+
+        result = bf2.copy();
+        result.merge(bf1);
+        assertEquals(27, result.cardinality());
+    }
+
     abstract static class AbstractDefaultBloomFilter implements BloomFilter {
         private final Shape shape;
         protected TreeSet<Integer> indices;
@@ -178,9 +247,21 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
 
         @Override
         public AbstractDefaultBloomFilter copy() {
-            final AbstractDefaultBloomFilter result = new SparseDefaultBloomFilter(getShape());
+            final AbstractDefaultBloomFilter result = new NonSparseDefaultBloomFilter(getShape());
             result.indices.addAll(indices);
             return result;
         }
     }
+
+    static class BrokenCardinality extends NonSparseDefaultBloomFilter {
+
+        BrokenCardinality(Shape shape) {
+            super(shape);
+        }
+
+        @Override
+        public int cardinality() {
+            return super.cardinality() + 1;
+        }
+    }
 }
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java
index b4b4014ee..f58828afc 100644
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java
@@ -16,6 +16,11 @@
  */
 package org.apache.commons.collections4.bloomfilter;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
 /**
  * Tests for the {@link SimpleBloomFilter}.
  */
@@ -24,4 +29,16 @@ public class SimpleBloomFilterTest extends AbstractBloomFilterTest<SimpleBloomFi
     protected SimpleBloomFilter createEmptyFilter(final Shape shape) {
         return new SimpleBloomFilter(shape);
     }
+
+    @Test
+    public void testMergeShortBitMapProducer() {
+        SimpleBloomFilter filter = createEmptyFilter(getTestShape());
+        // create a producer that returns too few values
+        // shape expects 2 longs we are sending 1.
+        BitMapProducer producer = p -> {
+            return p.test(2L);
+        };
+        assertTrue(filter.merge(producer));
+        assertEquals(1, filter.cardinality());
+    }
 }