You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2023/02/01 14:02:51 UTC
[lucene] 02/02: Reduce bloom filter size by using the optimal count for hash functions. (#11900)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit d1e4a3a44d8918262def84ae3363857772ea74b7
Author: Jean-François BOEUF <je...@gmail.com>
AuthorDate: Wed Feb 1 14:35:50 2023 +0100

    Reduce bloom filter size by using the optimal count for hash functions. (#11900)
---
 lucene/CHANGES.txt                                 |   5 +-
 .../lucene/codecs/bloom/BloomFilterFactory.java    |   4 +-
 .../codecs/bloom/BloomFilteringPostingsFormat.java |  13 ++-
 .../codecs/bloom/DefaultBloomFilterFactory.java    |   2 +-
 .../org/apache/lucene/codecs/bloom/FuzzySet.java   | 129 +++++++++++----------
 .../apache/lucene/codecs/bloom/HashFunction.java   |   2 +-
 .../apache/lucene/codecs/bloom/MurmurHash2.java    | 101 ----------------
 .../apache/lucene/codecs/bloom/MurmurHash64.java   |  85 ++++++++++++++
 8 files changed, 166 insertions(+), 175 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4d7f38d7678..c763b467d18 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -19,7 +19,10 @@ Improvements
 
 Optimizations
 ---------------------
-(No changes)
+
+* GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions
+  in order to achieve the same false positive probability with less memory.
+  (Jean-François Boeuf)
 
 Bug Fixes
 ---------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
index 69f31f19585..910f59a5ab9 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
@@ -42,9 +42,7 @@ public abstract class BloomFilterFactory {
    * @return null or a hopefully more densely packed, smaller bitset
    */
   public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
-    // Aim for a bitset size that would have 10% of bits set (so 90% of searches
-    // would fail-fast)
-    float targetMaxSaturation = 0.1f;
+    float targetMaxSaturation = initialSet.getTargetMaxSaturation();
     return initialSet.downsize(targetMaxSaturation);
   }
 
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
index 52a2e838261..7e5bf64fbbe 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
@@ -53,7 +53,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
  *
  * <p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a
  * per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a
- * ~8mb bitset and hashes values using {@link MurmurHash2}. This should be suitable for most
+ * ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most
  * purposes.
  *
  * <p>The format of the blm file is as follows:
@@ -83,8 +83,8 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
   /** Extension of Bloom Filters file */
   static final String BLOOM_EXTENSION = "blm";
 
-  BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
-  private PostingsFormat delegatePostingsFormat;
+  private final BloomFilterFactory bloomFilterFactory;
+  private final PostingsFormat delegatePostingsFormat;
 
   /**
    * Creates Bloom filters for a selection of fields created in the index. This is recorded as a set
@@ -120,7 +120,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
   // Used only by core Lucene at read-time via Service Provider instantiation -
   // do not use at Write-time in application code.
   public BloomFilteringPostingsFormat() {
-    super(BLOOM_CODEC_NAME);
+    this(null, new DefaultBloomFilterFactory());
   }
 
   @Override
@@ -366,6 +366,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
       public ImpactsEnum impacts(int flags) throws IOException {
         return delegate().impacts(flags);
       }
+
+      @Override
+      public String toString() {
+        return getClass().getSimpleName() + "(filter=" + filter.toString() + ")";
+      }
     }
 
     @Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
index 61e60dcbca1..306fdb9b4f9 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
@@ -31,7 +31,7 @@ public class DefaultBloomFilterFactory extends BloomFilterFactory {
   public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
     // Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set
     // with 10% of bits set
-    return FuzzySet.createSetBasedOnQuality(state.segmentInfo.maxDoc(), 0.10f);
+    return FuzzySet.createOptimalSet(state.segmentInfo.maxDoc(), 0.1023f);
   }
 
   @Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
index 5ccb04203da..f1d2dee65c7 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
@@ -44,21 +44,6 @@ import org.apache.lucene.util.RamUsageEstimator;
  */
 public class FuzzySet implements Accountable {
 
-  public static final int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI
-  public static final int VERSION_START = VERSION_SPI;
-  public static final int VERSION_CURRENT = 2;
-
-  public static HashFunction hashFunctionForVersion(int version) {
-    if (version < VERSION_START) {
-      throw new IllegalArgumentException(
-          "Version " + version + " is too old, expected at least " + VERSION_START);
-    } else if (version > VERSION_CURRENT) {
-      throw new IllegalArgumentException(
-          "Version " + version + " is too new, expected at most " + VERSION_CURRENT);
-    }
-    return MurmurHash2.INSTANCE;
-  }
-
   /**
    * Result from {@link FuzzySet#contains(BytesRef)}: can never return definitively YES (always
    * MAYBE), but can sometimes definitely return NO.
@@ -71,6 +56,7 @@ public class FuzzySet implements Accountable {
   private HashFunction hashFunction;
   private FixedBitSet filter;
   private int bloomSize;
+  private final int hashCount;
 
   // The sizes of BitSet used are all numbers that, when expressed in binary form,
   // are all ones. This is to enable fast downsizing from one bitset to another
@@ -82,12 +68,9 @@ public class FuzzySet implements Accountable {
   static final int[] usableBitSetSizes;
 
   static {
-    usableBitSetSizes = new int[30];
-    int mask = 1;
-    int size = mask;
+    usableBitSetSizes = new int[26];
     for (int i = 0; i < usableBitSetSizes.length; i++) {
-      size = (size << 1) | mask;
-      usableBitSetSizes[i] = size;
+      usableBitSetSizes[i] = (1 << (i + 6)) - 1;
     }
   }
 
@@ -131,48 +114,60 @@ public class FuzzySet implements Accountable {
 
   public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) {
     int setSize = getNearestSetSize(maxNumBytes);
-    return new FuzzySet(
-        new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT));
+    return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
   }
 
   public static FuzzySet createSetBasedOnQuality(
-      int maxNumUniqueValues, float desiredMaxSaturation) {
+      int maxNumUniqueValues, float desiredMaxSaturation, int version) {
     int setSize = getNearestSetSize(maxNumUniqueValues, desiredMaxSaturation);
-    return new FuzzySet(
-        new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT));
+    return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
+  }
+
+  public static FuzzySet createOptimalSet(int maxNumUniqueValues, float targetMaxFpp) {
+    int setSize =
+        (int)
+            Math.ceil(
+                (maxNumUniqueValues * Math.log(targetMaxFpp))
+                    / Math.log(1 / Math.pow(2, Math.log(2))));
+    setSize = getNearestSetSize(2 * setSize);
+    int optimalK = (int) Math.round(((double) setSize / maxNumUniqueValues) * Math.log(2));
+    return new FuzzySet(new FixedBitSet(setSize + 1), setSize, optimalK);
   }
 
-  private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
+  private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) {
     super();
     this.filter = filter;
     this.bloomSize = bloomSize;
-    this.hashFunction = hashFunction;
+    this.hashFunction = MurmurHash64.INSTANCE;
+    this.hashCount = hashCount;
   }
 
   /**
    * The main method required for a Bloom filter which, given a value determines set membership.
-   * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
+   * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. Hash
+   * generation follows the same principles as {@link #addValue(BytesRef)}
    *
    * @return NO or MAYBE
    */
   public ContainsResult contains(BytesRef value) {
-    int hash = hashFunction.hash(value);
-    if (hash < 0) {
-      hash = hash * -1;
+    long hash = hashFunction.hash(value);
+    int msb = (int) (hash >>> Integer.SIZE);
+    int lsb = (int) hash;
+    for (int i = 0; i < hashCount; i++) {
+      int bloomPos = (lsb + i * msb);
+      if (!mayContainValue(bloomPos)) {
+        return ContainsResult.NO;
+      }
     }
-    return mayContainValue(hash);
+    return ContainsResult.MAYBE;
   }
 
   /**
    * Serializes the data set to file using the following format:
    *
    * <ul>
-   *   <li>FuzzySet --&gt;FuzzySetVersion,HashFunctionName,BloomSize,
-   *       NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
-   *   <li>HashFunctionName --&gt; {@link DataOutput#writeString(String) String} The name of a
-   *       ServiceProvider registered {@link HashFunction}
-   *   <li>FuzzySetVersion --&gt; {@link DataOutput#writeInt Uint32} The version number of the
-   *       {@link FuzzySet} class
+   *   <li>FuzzySet --&gt;hashCount,BloomSize, NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
+   *   <li>hashCount --&gt; {@link DataOutput#writeVInt Uint32} The number of hash functions (k).
    *   <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used to project
    *       hashes into the field's Bitset
    *   <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of longs (as returned
@@ -185,7 +180,7 @@ public class FuzzySet implements Accountable {
    * @throws IOException If there is a low-level I/O error
    */
   public void serialize(DataOutput out) throws IOException {
-    out.writeInt(VERSION_CURRENT);
+    out.writeVInt(hashCount);
     out.writeInt(bloomSize);
     long[] bits = filter.getBits();
     out.writeInt(bits.length);
@@ -197,11 +192,7 @@ public class FuzzySet implements Accountable {
   }
 
   public static FuzzySet deserialize(DataInput in) throws IOException {
-    int version = in.readInt();
-    if (version == VERSION_SPI) {
-      in.readString();
-    }
-    final HashFunction hashFunction = hashFunctionForVersion(version);
+    int hashCount = in.readVInt();
     int bloomSize = in.readInt();
     int numLongs = in.readInt();
     long[] longs = new long[numLongs];
@@ -209,36 +200,33 @@ public class FuzzySet implements Accountable {
       longs[i] = in.readLong();
     }
     FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1);
-    return new FuzzySet(bits, bloomSize, hashFunction);
+    return new FuzzySet(bits, bloomSize, hashCount);
   }
 
-  private ContainsResult mayContainValue(int positiveHash) {
-    assert positiveHash >= 0;
+  private boolean mayContainValue(int aHash) {
     // Bloom sizes are always base 2 and so can be ANDed for a fast modulo
-    int pos = positiveHash & bloomSize;
-    if (filter.get(pos)) {
-      // This term may be recorded in this index (but could be a collision)
-      return ContainsResult.MAYBE;
-    }
-    // definitely NOT in this segment
-    return ContainsResult.NO;
+    int pos = aHash & bloomSize;
+    return filter.get(pos);
   }
 
   /**
-   * Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
-   * chosen size of the internal bitset.
+   * Records a value in the set. The referenced bytes are hashed. From the 64-bit generated hash,
+   * two 32-bit hashes are derived from the msb and lsb which can be used to derive more hashes (see
+   * https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf). Finally, each generated hash
+   * is modulo n'd where n is the chosen size of the internal bitset.
    *
    * @param value the key value to be hashed
    * @throws IOException If there is a low-level I/O error
    */
   public void addValue(BytesRef value) throws IOException {
-    int hash = hashFunction.hash(value);
-    if (hash < 0) {
-      hash = hash * -1;
+    long hash = hashFunction.hash(value);
+    int msb = (int) (hash >>> Integer.SIZE);
+    int lsb = (int) hash;
+    for (int i = 0; i < hashCount; i++) {
+      // Bitmasking using bloomSize is effectively a modulo operation.
+      int bloomPos = (lsb + i * msb) & bloomSize;
+      filter.set(bloomPos);
     }
-    // Bitmasking using bloomSize is effectively a modulo operation.
-    int bloomPos = hash & bloomSize;
-    filter.set(bloomPos);
   }
 
   /**
@@ -279,7 +267,7 @@ public class FuzzySet implements Accountable {
     } else {
       return null;
     }
-    return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashFunction);
+    return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashCount);
   }
 
   public int getEstimatedUniqueValues() {
@@ -297,6 +285,10 @@ public class FuzzySet implements Accountable {
     return (int) (setSizeAsDouble * logInverseSaturation);
   }
 
+  public float getTargetMaxSaturation() {
+    return 0.5f;
+  }
+
   public float getSaturation() {
     int numBitsSet = filter.cardinality();
     return (float) numBitsSet / (float) bloomSize;
@@ -309,6 +301,15 @@ public class FuzzySet implements Accountable {
 
   @Override
   public String toString() {
-    return getClass().getSimpleName() + "(hash=" + hashFunction + ")";
+    return getClass().getSimpleName()
+        + "(hash="
+        + hashFunction
+        + ", k="
+        + hashCount
+        + ", bits="
+        + filter.cardinality()
+        + "/"
+        + filter.length()
+        + ")";
   }
 }
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java
index 5b22d121d2e..eac514a7bb8 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java
@@ -33,5 +33,5 @@ public abstract class HashFunction {
    * @param bytes the data to be hashed
    * @return the hash of the bytes referenced by bytes.offset and length bytes.length
    */
-  public abstract int hash(BytesRef bytes);
+  public abstract long hash(BytesRef bytes);
 }
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java
deleted file mode 100644
index c3f5853f409..00000000000
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.bloom;
-
-import org.apache.lucene.util.BytesRef;
-
-/**
- * This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See
- * http://murmurhash.googlepages.com/ for more details.
- *
- * <p>The C version of MurmurHash 2.0 found at that site was ported to Java by Andrzej Bialecki (ab
- * at getopt org).
- *
- * <p>The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable
- * choice of hashing functions as the core function had to be adapted to work with BytesRefs with
- * offsets and lengths rather than raw byte arrays.
- *
- * @lucene.experimental
- */
-public final class MurmurHash2 extends HashFunction {
-
-  public static final MurmurHash2 INSTANCE = new MurmurHash2();
-
-  private MurmurHash2() {}
-
-  public static int hash(byte[] data, int seed, int offset, int len) {
-    int m = 0x5bd1e995;
-    int r = 24;
-    int h = seed ^ len;
-    int len_4 = len >> 2;
-    for (int i = 0; i < len_4; i++) {
-      int i_4 = offset + (i << 2);
-      int k = data[i_4 + 3];
-      k = k << 8;
-      k = k | (data[i_4 + 2] & 0xff);
-      k = k << 8;
-      k = k | (data[i_4 + 1] & 0xff);
-      k = k << 8;
-      k = k | (data[i_4 + 0] & 0xff);
-      k *= m;
-      k ^= k >>> r;
-      k *= m;
-      h *= m;
-      h ^= k;
-    }
-    int len_m = len_4 << 2;
-    int left = len - len_m;
-    if (left != 0) {
-      if (left >= 3) {
-        h ^= data[offset + len - 3] << 16;
-      }
-      if (left >= 2) {
-        h ^= data[offset + len - 2] << 8;
-      }
-      if (left >= 1) {
-        h ^= data[offset + len - 1];
-      }
-      h *= m;
-    }
-    h ^= h >>> 13;
-    h *= m;
-    h ^= h >>> 15;
-    return h;
-  }
-
-  /**
-   * Generates 32 bit hash from byte array with default seed value.
-   *
-   * @param data byte array to hash
-   * @param offset the start position in the array to hash
-   * @param len length of the array elements to hash
-   * @return 32 bit hash of the given array
-   */
-  public static final int hash32(final byte[] data, int offset, int len) {
-    return MurmurHash2.hash(data, 0x9747b28c, offset, len);
-  }
-
-  @Override
-  public final int hash(BytesRef br) {
-    return hash32(br.bytes, br.offset, br.length);
-  }
-
-  @Override
-  public String toString() {
-    return getClass().getSimpleName();
-  }
-}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java
new file mode 100644
index 00000000000..1d189773143
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.bloom;
+
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See
+ * http://murmurhash.googlepages.com/ for more details.
+ *
+ * <p>The code from Apache Commons was adapted in the form here to work with BytesRefs with offsets
+ * and lengths rather than raw byte arrays.
+ */
+public class MurmurHash64 extends HashFunction {
+  private static final long M64 = 0xc6a4a7935bd1e995L;
+  private static final int R64 = 47;
+  public static final HashFunction INSTANCE = new MurmurHash64();
+
+  /**
+   * Generates a 64-bit hash from byte array of the given length and seed.
+   *
+   * @param data The input byte array
+   * @param seed The initial seed value
+   * @param length The length of the array
+   * @return The 64-bit hash of the given array
+   */
+  public static long hash64(byte[] data, int seed, int offset, int length) {
+    long h = (seed & 0xffffffffL) ^ (length * M64);
+
+    final int nblocks = length >> 3;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+
+      long k = (long) BitUtil.VH_LE_LONG.get(data, offset);
+      k *= M64;
+      k ^= k >>> R64;
+      k *= M64;
+
+      h ^= k;
+      h *= M64;
+
+      offset += Long.BYTES;
+    }
+
+    int remaining = length & 0x07;
+    if (0 < remaining) {
+      for (int i = 0; i < remaining; i++) {
+        h ^= ((long) data[offset + i] & 0xff) << (Byte.SIZE * i);
+      }
+      h *= M64;
+    }
+
+    h ^= h >>> R64;
+    h *= M64;
+    h ^= h >>> R64;
+
+    return h;
+  }
+
+  @Override
+  public final long hash(BytesRef br) {
+    return hash64(br.bytes, 0xe17a1465, br.offset, br.length);
+  }
+
+  @Override
+  public String toString() {
+    return getClass().getSimpleName();
+  }
+}