You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2020/02/27 20:48:18 UTC

[commons-collections] branch master updated (e08f0be -> 8b4ecbc)

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git.


    from e08f0be  Revert CountingBloomFilter to ignore counts from another filter.
     new 9a496dc  Update the AbstractBloomFilter to not use BitSet for cardinality.
     new 24ad759  Document the HashFunctionIdentity Signedness
     new 8b4ecbc  Compute the bit index into a Bloom filter using bit shifts.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../bloomfilter/AbstractBloomFilter.java           | 84 +++++++++++-----------
 .../bloomfilter/BloomFilterIndexer.java            | 81 +++++++++++++++++++++
 .../bloomfilter/HasherBloomFilter.java             |  9 ++-
 .../bloomfilter/hasher/HashFunctionIdentity.java   | 28 +++++++-
 .../bloomfilter/BloomFilterIndexerTest.java        | 54 ++++++++++++++
 5 files changed, 208 insertions(+), 48 deletions(-)
 create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java
 create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java


[commons-collections] 02/03: Document the HashFunctionIdentity Signedness

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git

commit 24ad759b3100ef2026381fcb4065613a9dff6117
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Mon Feb 24 23:27:55 2020 +0000

    Document the HashFunctionIdentity Signedness
---
 .../bloomfilter/hasher/HashFunctionIdentity.java   | 28 +++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java
index 0047ad1..83f2f90 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java
@@ -56,9 +56,35 @@ public interface HashFunctionIdentity {
 
     /**
      * Identifies the signedness of the calculations for this function.
+     * <p>
+     * When the hash function executes it typically returns an array of bytes.
+     * That array is converted into one or more numerical values which will be provided
+     * as a {@code long} primitive type.
+     * The signedness identifies if those {@code long} values are signed or unsigned.
+     * For example a hash function that outputs only 32-bits can be unsigned if converted
+     * using {@link Integer#toUnsignedLong(int)}. A hash function that outputs more than
+     * 64-bits is typically signed.
+     * </p>
      */
     enum Signedness {
-        SIGNED, UNSIGNED
+        /**
+         * The result of {@link HashFunction#apply(byte[], int)} is signed,
+         * thus the sign bit may be set.
+         *
+         * <p>The result can be used with {@code Math.floorMod(x, y)} to generate a positive
+         * value if y is positive.
+         *
+         * @see Math#floorMod(int, int)
+         */
+        SIGNED,
+        /**
+         * The result of {@link HashFunction#apply(byte[], int)} is unsigned,
+         * thus the sign bit is never set.
+         *
+         * <p>The result can be used with {@code x % y} to generate a positive
+         * value if y is positive.
+         */
+        UNSIGNED
     }
 
     /**


[commons-collections] 03/03: Compute the bit index into a Bloom filter using bit shifts.

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git

commit 8b4ecbc084eaf8e61f002612f7ca156fcc1e107e
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Tue Feb 25 00:23:27 2020 +0000

    Compute the bit index into a Bloom filter using bit shifts.
    
    Removes the use of Math.floorMod and integer division.
    
    Operations have been put into a class for reuse among filters.
---
 .../bloomfilter/AbstractBloomFilter.java           |  6 +-
 .../bloomfilter/BloomFilterIndexer.java            | 81 ++++++++++++++++++++++
 .../bloomfilter/HasherBloomFilter.java             |  9 ++-
 .../bloomfilter/BloomFilterIndexerTest.java        | 54 +++++++++++++++
 4 files changed, 142 insertions(+), 8 deletions(-)

diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java
index 9bbeb43..8a6b948 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java
@@ -117,9 +117,9 @@ public abstract class AbstractBloomFilter implements BloomFilter {
         final OfInt iter = hasher.getBits(shape);
         while (iter.hasNext()) {
             final int idx = iter.nextInt();
-            final int buffIdx = idx / Long.SIZE;
-            final int pwr = Math.floorMod(idx, Long.SIZE);
-            final long buffOffset = 1L << pwr;
+            BloomFilterIndexer.checkPositive(idx);
+            final int buffIdx = BloomFilterIndexer.getLongIndex(idx);
+            final long buffOffset = BloomFilterIndexer.getLongBit(idx);
             if ((buff[buffIdx] & buffOffset) == 0) {
                 return false;
             }
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java
new file mode 100644
index 0000000..9bed48b
--- /dev/null
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+/**
+ * Contains functions to convert {@code int} indices into Bloom filter bit positions.
+ */
+final class BloomFilterIndexer {
+    /** A bit shift to apply to an integer to divided by 64 (2^6). */
+    private static final int DIVIDE_BY_64 = 6;
+
+    /** Do not instantiate. */
+    private BloomFilterIndexer() {}
+
+    /**
+     * Check the index is positive.
+     *
+     * @param bitIndex the bit index
+     * @throws IndexOutOfBoundsException if the index is not positive
+     */
+    static void checkPositive(int bitIndex) {
+        if (bitIndex < 0) {
+            throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex);
+        }
+    }
+
+    /**
+     * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs
+     * to store bits starting at index 0.
+     *
+     * <p>The index is assumed to be positive. For a positive index the result will match
+     * {@code bitIndex / 64}.
+     *
+     * <p>The divide is performed using bit shifts. If the input is negative the behaviour
+     * is not defined.
+     *
+     * @param bitIndex the bit index (assumed to be positive)
+     * @return the filter index
+     * @see #checkPositive(int)
+     */
+    static int getLongIndex(int bitIndex) {
+        // Use a signed shift. Any negative index will produce a negative value
+        // by sign-extension and if used as an index into an array it will throw an exception.
+        return bitIndex >> DIVIDE_BY_64;
+    }
+
+    /**
+     * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit
+     * longs to store bits starting at index 0. The returned value is a {@code long} with only
+     * 1 bit set.
+     *
+     * <p>The index is assumed to be positive. For a positive index the result will match
+     * {@code 1L << (bitIndex % 64)}.
+     *
+     * <p>If the input is negative the behaviour is not defined.
+     *
+     * @param bitIndex the bit index (assumed to be positive)
+     * @return the filter bit
+     * @see #checkPositive(int)
+     */
+    static long getLongBit(int bitIndex) {
+        // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this
+        // using 0x3f (63) or compute bitIndex % 64.
+        // If the index is negative the shift will be in the opposite direction.
+        return 1L << bitIndex;
+    }
+}
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java
index 8390144..e6436a8 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java
@@ -101,11 +101,10 @@ public class HasherBloomFilter extends AbstractBloomFilter {
         final long[] result = new long[n];
         final OfInt iter = hasher.getBits(hasher.getShape());
         iter.forEachRemaining((IntConsumer) idx -> {
-            long buff = result[idx / Long.SIZE];
-            final long pwr = Math.floorMod(idx, Long.SIZE);
-            final long buffOffset = 1L << pwr;
-            buff |= buffOffset;
-            result[idx / Long.SIZE] = buff;
+            BloomFilterIndexer.checkPositive(idx);
+            final int buffIdx = BloomFilterIndexer.getLongIndex(idx);
+            final long buffOffset = BloomFilterIndexer.getLongBit(idx);
+            result[buffIdx] |= buffOffset;
         });
 
         int limit = result.length;
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java
new file mode 100644
index 0000000..a552706
--- /dev/null
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * Tests for the {@link BloomFilterIndexer}.
+ */
+public class BloomFilterIndexerTest {
+
+    @Test(expected = IndexOutOfBoundsException.class)
+    public void testCheckPositiveThrows() {
+        BloomFilterIndexer.checkPositive(-1);
+    }
+
+    @Test
+    public void testGetLongIndex() {
+        final Random rng = ThreadLocalRandom.current();
+        for (int i = 0; i < 10; i++) {
+            // Random positive number
+            final int index = rng.nextInt() >>> 1;
+            Assert.assertEquals(index / Long.SIZE, BloomFilterIndexer.getLongIndex(index));
+        }
+    }
+
+    @Test
+    public void testGetLongBit() {
+        final Random rng = ThreadLocalRandom.current();
+        for (int i = 0; i < 10; i++) {
+            // Random positive number
+            final int index = rng.nextInt() >>> 1;
+            Assert.assertEquals(1L << (index % Long.SIZE), BloomFilterIndexer.getLongBit(index));
+        }
+    }
+}


[commons-collections] 01/03: Update the AbstractBloomFilter to not use BitSet for cardinality.

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git

commit 9a496dc61cee95257b3a266a4d52138bea7f7e9a
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Mon Feb 24 23:01:39 2020 +0000

    Update the AbstractBloomFilter to not use BitSet for cardinality.
    
    The cardinality can be performed without memory allocation using
    Long.bitCount.
---
 .../bloomfilter/AbstractBloomFilter.java           | 78 +++++++++++-----------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java
index e568bf5..9bbeb43 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java
@@ -16,8 +16,8 @@
  */
 package org.apache.commons.collections4.bloomfilter;
 
-import java.util.BitSet;
 import java.util.PrimitiveIterator.OfInt;
+import java.util.function.LongBinaryOperator;
 
 import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity;
 import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
@@ -66,11 +66,11 @@ public abstract class AbstractBloomFilter implements BloomFilter {
         final long[] mine = getBits();
         final long[] theirs = other.getBits();
         final int limit = Integer.min(mine.length, theirs.length);
-        final long[] result = new long[limit];
+        int count = 0;
         for (int i = 0; i < limit; i++) {
-            result[i] = mine[i] & theirs[i];
+            count += Long.bitCount(mine[i] & theirs[i]);
         }
-        return BitSet.valueOf(result).cardinality();
+        return count;
     }
 
     /**
@@ -80,7 +80,11 @@ public abstract class AbstractBloomFilter implements BloomFilter {
      */
     @Override
     public int cardinality() {
-        return BitSet.valueOf(getBits()).cardinality();
+        int count = 0;
+        for (final long bits : getBits()) {
+            count += Long.bitCount(bits);
+        }
+        return count;
     }
 
     /**
@@ -182,27 +186,8 @@ public abstract class AbstractBloomFilter implements BloomFilter {
 
     @Override
     public int orCardinality(final BloomFilter other) {
-        verifyShape(other);
-        final long[] mine = getBits();
-        final long[] theirs = other.getBits();
-        long[] remainder = null;
-        long[] result = null;
-        if (mine.length > theirs.length) {
-            result = new long[mine.length];
-            remainder = mine;
-        } else {
-            result = new long[theirs.length];
-            remainder = theirs;
-
-        }
-        final int limit = Integer.min(mine.length, theirs.length);
-        for (int i = 0; i < limit; i++) {
-            result[i] = mine[i] | theirs[i];
-        }
-        if (limit < result.length) {
-            System.arraycopy(remainder, limit, result, limit, result.length - limit);
-        }
-        return BitSet.valueOf(result).cardinality();
+        // Logical OR
+        return opCardinality(other, (a, b) -> a | b);
     }
 
     /**
@@ -253,26 +238,41 @@ public abstract class AbstractBloomFilter implements BloomFilter {
      */
     @Override
     public int xorCardinality(final BloomFilter other) {
+        // Logical XOR
+        return opCardinality(other, (a, b) -> a ^ b);
+    }
+
+    /**
+     * Perform the operation on the matched longs from this filter and the other filter
+     * and count the cardinality.
+     *
+     * <p>The remaining unmatched longs from the larger filter are always counted. This
+     * method is suitable for OR and XOR cardinality.
+     *
+     * @param other the other Bloom filter.
+     * @param operation the operation (e.g. OR, XOR)
+     * @return the cardinality
+     */
+    private int opCardinality(final BloomFilter other, LongBinaryOperator operation) {
         verifyShape(other);
         final long[] mine = getBits();
         final long[] theirs = other.getBits();
-        long[] remainder = null;
-        long[] result = null;
+        long[] small;
+        long[] big;
         if (mine.length > theirs.length) {
-            result = new long[mine.length];
-            remainder = mine;
+            big = mine;
+            small = theirs;
         } else {
-            result = new long[theirs.length];
-            remainder = theirs;
-
+            small = mine;
+            big = theirs;
         }
-        final int limit = Integer.min(mine.length, theirs.length);
-        for (int i = 0; i < limit; i++) {
-            result[i] = mine[i] ^ theirs[i];
+        int count = 0;
+        for (int i = 0; i < small.length; i++) {
+            count += Long.bitCount(operation.applyAsLong(small[i], big[i]));
         }
-        if (limit < result.length) {
-            System.arraycopy(remainder, limit, result, limit, result.length - limit);
+        for (int i = small.length; i < big.length; i++) {
+            count += Long.bitCount(big[i]);
         }
-        return BitSet.valueOf(result).cardinality();
+        return count;
     }
 }