You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2023/02/01 14:02:51 UTC
[lucene] 02/02: Reduce bloom filter size by using the optimal count for hash functions. (#11900)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit d1e4a3a44d8918262def84ae3363857772ea74b7
Author: Jean-François BOEUF <je...@gmail.com>
AuthorDate: Wed Feb 1 14:35:50 2023 +0100
Reduce bloom filter size by using the optimal count for hash functions. (#11900)
---
lucene/CHANGES.txt | 5 +-
.../lucene/codecs/bloom/BloomFilterFactory.java | 4 +-
.../codecs/bloom/BloomFilteringPostingsFormat.java | 13 ++-
.../codecs/bloom/DefaultBloomFilterFactory.java | 2 +-
.../org/apache/lucene/codecs/bloom/FuzzySet.java | 129 +++++++++++----------
.../apache/lucene/codecs/bloom/HashFunction.java | 2 +-
.../apache/lucene/codecs/bloom/MurmurHash2.java | 101 ----------------
.../apache/lucene/codecs/bloom/MurmurHash64.java | 85 ++++++++++++++
8 files changed, 166 insertions(+), 175 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4d7f38d7678..c763b467d18 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -19,7 +19,10 @@ Improvements
Optimizations
---------------------
-(No changes)
+
+* GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions
+ in order to achieve the same false positive probability with less memory.
+ (Jean-François Boeuf)
Bug Fixes
---------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
index 69f31f19585..910f59a5ab9 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
@@ -42,9 +42,7 @@ public abstract class BloomFilterFactory {
* @return null or a hopefully more densely packed, smaller bitset
*/
public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
- // Aim for a bitset size that would have 10% of bits set (so 90% of searches
- // would fail-fast)
- float targetMaxSaturation = 0.1f;
+ float targetMaxSaturation = initialSet.getTargetMaxSaturation();
return initialSet.downsize(targetMaxSaturation);
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
index 52a2e838261..7e5bf64fbbe 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
@@ -53,7 +53,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
*
* <p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a
* per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a
- * ~8mb bitset and hashes values using {@link MurmurHash2}. This should be suitable for most
+ * ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most
* purposes.
*
* <p>The format of the blm file is as follows:
@@ -83,8 +83,8 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
/** Extension of Bloom Filters file */
static final String BLOOM_EXTENSION = "blm";
- BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
- private PostingsFormat delegatePostingsFormat;
+ private final BloomFilterFactory bloomFilterFactory;
+ private final PostingsFormat delegatePostingsFormat;
/**
* Creates Bloom filters for a selection of fields created in the index. This is recorded as a set
@@ -120,7 +120,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
// Used only by core Lucene at read-time via Service Provider instantiation -
// do not use at Write-time in application code.
public BloomFilteringPostingsFormat() {
- super(BLOOM_CODEC_NAME);
+ this(null, new DefaultBloomFilterFactory());
}
@Override
@@ -366,6 +366,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
public ImpactsEnum impacts(int flags) throws IOException {
return delegate().impacts(flags);
}
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "(filter=" + filter.toString() + ")";
+ }
}
@Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
index 61e60dcbca1..306fdb9b4f9 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
@@ -31,7 +31,7 @@ public class DefaultBloomFilterFactory extends BloomFilterFactory {
public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
// Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set
// with 10% of bits set
- return FuzzySet.createSetBasedOnQuality(state.segmentInfo.maxDoc(), 0.10f);
+ return FuzzySet.createOptimalSet(state.segmentInfo.maxDoc(), 0.1023f);
}
@Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
index 5ccb04203da..f1d2dee65c7 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java
@@ -44,21 +44,6 @@ import org.apache.lucene.util.RamUsageEstimator;
*/
public class FuzzySet implements Accountable {
- public static final int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI
- public static final int VERSION_START = VERSION_SPI;
- public static final int VERSION_CURRENT = 2;
-
- public static HashFunction hashFunctionForVersion(int version) {
- if (version < VERSION_START) {
- throw new IllegalArgumentException(
- "Version " + version + " is too old, expected at least " + VERSION_START);
- } else if (version > VERSION_CURRENT) {
- throw new IllegalArgumentException(
- "Version " + version + " is too new, expected at most " + VERSION_CURRENT);
- }
- return MurmurHash2.INSTANCE;
- }
-
/**
* Result from {@link FuzzySet#contains(BytesRef)}: can never return definitively YES (always
* MAYBE), but can sometimes definitely return NO.
@@ -71,6 +56,7 @@ public class FuzzySet implements Accountable {
private HashFunction hashFunction;
private FixedBitSet filter;
private int bloomSize;
+ private final int hashCount;
// The sizes of BitSet used are all numbers that, when expressed in binary form,
// are all ones. This is to enable fast downsizing from one bitset to another
@@ -82,12 +68,9 @@ public class FuzzySet implements Accountable {
static final int[] usableBitSetSizes;
static {
- usableBitSetSizes = new int[30];
- int mask = 1;
- int size = mask;
+ usableBitSetSizes = new int[26];
for (int i = 0; i < usableBitSetSizes.length; i++) {
- size = (size << 1) | mask;
- usableBitSetSizes[i] = size;
+ usableBitSetSizes[i] = (1 << (i + 6)) - 1;
}
}
@@ -131,48 +114,60 @@ public class FuzzySet implements Accountable {
public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) {
int setSize = getNearestSetSize(maxNumBytes);
- return new FuzzySet(
- new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT));
+ return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
}
public static FuzzySet createSetBasedOnQuality(
- int maxNumUniqueValues, float desiredMaxSaturation) {
+ int maxNumUniqueValues, float desiredMaxSaturation, int version) {
int setSize = getNearestSetSize(maxNumUniqueValues, desiredMaxSaturation);
- return new FuzzySet(
- new FixedBitSet(setSize + 1), setSize, hashFunctionForVersion(VERSION_CURRENT));
+ return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
+ }
+
+ public static FuzzySet createOptimalSet(int maxNumUniqueValues, float targetMaxFpp) {
+ int setSize =
+ (int)
+ Math.ceil(
+ (maxNumUniqueValues * Math.log(targetMaxFpp))
+ / Math.log(1 / Math.pow(2, Math.log(2))));
+ setSize = getNearestSetSize(2 * setSize);
+ int optimalK = (int) Math.round(((double) setSize / maxNumUniqueValues) * Math.log(2));
+ return new FuzzySet(new FixedBitSet(setSize + 1), setSize, optimalK);
}
- private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
+ private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) {
super();
this.filter = filter;
this.bloomSize = bloomSize;
- this.hashFunction = hashFunction;
+ this.hashFunction = MurmurHash64.INSTANCE;
+ this.hashCount = hashCount;
}
/**
* The main method required for a Bloom filter which, given a value determines set membership.
- * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
+ * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. Hash
+ * generation follows the same principles as {@link #addValue(BytesRef)}
*
* @return NO or MAYBE
*/
public ContainsResult contains(BytesRef value) {
- int hash = hashFunction.hash(value);
- if (hash < 0) {
- hash = hash * -1;
+ long hash = hashFunction.hash(value);
+ int msb = (int) (hash >>> Integer.SIZE);
+ int lsb = (int) hash;
+ for (int i = 0; i < hashCount; i++) {
+ int bloomPos = (lsb + i * msb);
+ if (!mayContainValue(bloomPos)) {
+ return ContainsResult.NO;
+ }
}
- return mayContainValue(hash);
+ return ContainsResult.MAYBE;
}
/**
* Serializes the data set to file using the following format:
*
* <ul>
- * <li>FuzzySet -->FuzzySetVersion,HashFunctionName,BloomSize,
- * NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
- * <li>HashFunctionName --> {@link DataOutput#writeString(String) String} The name of a
- * ServiceProvider registered {@link HashFunction}
- * <li>FuzzySetVersion --> {@link DataOutput#writeInt Uint32} The version number of the
- * {@link FuzzySet} class
+ * <li>FuzzySet -->hashCount,BloomSize, NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
+ * <li>hashCount --> {@link DataOutput#writeVInt Uint32} The number of hash functions (k).
* <li>BloomSize --> {@link DataOutput#writeInt Uint32} The modulo value used to project
* hashes into the field's Bitset
* <li>NumBitSetWords --> {@link DataOutput#writeInt Uint32} The number of longs (as returned
@@ -185,7 +180,7 @@ public class FuzzySet implements Accountable {
* @throws IOException If there is a low-level I/O error
*/
public void serialize(DataOutput out) throws IOException {
- out.writeInt(VERSION_CURRENT);
+ out.writeVInt(hashCount);
out.writeInt(bloomSize);
long[] bits = filter.getBits();
out.writeInt(bits.length);
@@ -197,11 +192,7 @@ public class FuzzySet implements Accountable {
}
public static FuzzySet deserialize(DataInput in) throws IOException {
- int version = in.readInt();
- if (version == VERSION_SPI) {
- in.readString();
- }
- final HashFunction hashFunction = hashFunctionForVersion(version);
+ int hashCount = in.readVInt();
int bloomSize = in.readInt();
int numLongs = in.readInt();
long[] longs = new long[numLongs];
@@ -209,36 +200,33 @@ public class FuzzySet implements Accountable {
longs[i] = in.readLong();
}
FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1);
- return new FuzzySet(bits, bloomSize, hashFunction);
+ return new FuzzySet(bits, bloomSize, hashCount);
}
- private ContainsResult mayContainValue(int positiveHash) {
- assert positiveHash >= 0;
+ private boolean mayContainValue(int aHash) {
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo
- int pos = positiveHash & bloomSize;
- if (filter.get(pos)) {
- // This term may be recorded in this index (but could be a collision)
- return ContainsResult.MAYBE;
- }
- // definitely NOT in this segment
- return ContainsResult.NO;
+ int pos = aHash & bloomSize;
+ return filter.get(pos);
}
/**
- * Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
- * chosen size of the internal bitset.
+ * Records a value in the set. The referenced bytes are hashed. From the 64-bit generated hash,
+ * two 32-bit hashes are derived from the msb and lsb which can be used to derive more hashes (see
+ * https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf). Finally, each generated hash
+ * is modulo n'd where n is the chosen size of the internal bitset.
*
* @param value the key value to be hashed
* @throws IOException If there is a low-level I/O error
*/
public void addValue(BytesRef value) throws IOException {
- int hash = hashFunction.hash(value);
- if (hash < 0) {
- hash = hash * -1;
+ long hash = hashFunction.hash(value);
+ int msb = (int) (hash >>> Integer.SIZE);
+ int lsb = (int) hash;
+ for (int i = 0; i < hashCount; i++) {
+ // Bitmasking using bloomSize is effectively a modulo operation.
+ int bloomPos = (lsb + i * msb) & bloomSize;
+ filter.set(bloomPos);
}
- // Bitmasking using bloomSize is effectively a modulo operation.
- int bloomPos = hash & bloomSize;
- filter.set(bloomPos);
}
/**
@@ -279,7 +267,7 @@ public class FuzzySet implements Accountable {
} else {
return null;
}
- return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashFunction);
+ return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashCount);
}
public int getEstimatedUniqueValues() {
@@ -297,6 +285,10 @@ public class FuzzySet implements Accountable {
return (int) (setSizeAsDouble * logInverseSaturation);
}
+ public float getTargetMaxSaturation() {
+ return 0.5f;
+ }
+
public float getSaturation() {
int numBitsSet = filter.cardinality();
return (float) numBitsSet / (float) bloomSize;
@@ -309,6 +301,15 @@ public class FuzzySet implements Accountable {
@Override
public String toString() {
- return getClass().getSimpleName() + "(hash=" + hashFunction + ")";
+ return getClass().getSimpleName()
+ + "(hash="
+ + hashFunction
+ + ", k="
+ + hashCount
+ + ", bits="
+ + filter.cardinality()
+ + "/"
+ + filter.length()
+ + ")";
}
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java
index 5b22d121d2e..eac514a7bb8 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java
@@ -33,5 +33,5 @@ public abstract class HashFunction {
* @param bytes the data to be hashed
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
*/
- public abstract int hash(BytesRef bytes);
+ public abstract long hash(BytesRef bytes);
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java
deleted file mode 100644
index c3f5853f409..00000000000
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash2.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.bloom;
-
-import org.apache.lucene.util.BytesRef;
-
-/**
- * This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See
- * http://murmurhash.googlepages.com/ for more details.
- *
- * <p>The C version of MurmurHash 2.0 found at that site was ported to Java by Andrzej Bialecki (ab
- * at getopt org).
- *
- * <p>The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable
- * choice of hashing functions as the core function had to be adapted to work with BytesRefs with
- * offsets and lengths rather than raw byte arrays.
- *
- * @lucene.experimental
- */
-public final class MurmurHash2 extends HashFunction {
-
- public static final MurmurHash2 INSTANCE = new MurmurHash2();
-
- private MurmurHash2() {}
-
- public static int hash(byte[] data, int seed, int offset, int len) {
- int m = 0x5bd1e995;
- int r = 24;
- int h = seed ^ len;
- int len_4 = len >> 2;
- for (int i = 0; i < len_4; i++) {
- int i_4 = offset + (i << 2);
- int k = data[i_4 + 3];
- k = k << 8;
- k = k | (data[i_4 + 2] & 0xff);
- k = k << 8;
- k = k | (data[i_4 + 1] & 0xff);
- k = k << 8;
- k = k | (data[i_4 + 0] & 0xff);
- k *= m;
- k ^= k >>> r;
- k *= m;
- h *= m;
- h ^= k;
- }
- int len_m = len_4 << 2;
- int left = len - len_m;
- if (left != 0) {
- if (left >= 3) {
- h ^= data[offset + len - 3] << 16;
- }
- if (left >= 2) {
- h ^= data[offset + len - 2] << 8;
- }
- if (left >= 1) {
- h ^= data[offset + len - 1];
- }
- h *= m;
- }
- h ^= h >>> 13;
- h *= m;
- h ^= h >>> 15;
- return h;
- }
-
- /**
- * Generates 32 bit hash from byte array with default seed value.
- *
- * @param data byte array to hash
- * @param offset the start position in the array to hash
- * @param len length of the array elements to hash
- * @return 32 bit hash of the given array
- */
- public static final int hash32(final byte[] data, int offset, int len) {
- return MurmurHash2.hash(data, 0x9747b28c, offset, len);
- }
-
- @Override
- public final int hash(BytesRef br) {
- return hash32(br.bytes, br.offset, br.length);
- }
-
- @Override
- public String toString() {
- return getClass().getSimpleName();
- }
-}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java
new file mode 100644
index 00000000000..1d189773143
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.bloom;
+
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See
+ * http://murmurhash.googlepages.com/ for more details.
+ *
+ * <p>The code from Apache Commons was adapted in the form here to work with BytesRefs with offsets
+ * and lengths rather than raw byte arrays.
+ */
+public class MurmurHash64 extends HashFunction {
+ private static final long M64 = 0xc6a4a7935bd1e995L;
+ private static final int R64 = 47;
+ public static final HashFunction INSTANCE = new MurmurHash64();
+
+ /**
+ * Generates a 64-bit hash from byte array of the given length and seed.
+ *
+ * @param data The input byte array
+ * @param seed The initial seed value
+ * @param length The length of the array
+ * @return The 64-bit hash of the given array
+ */
+ public static long hash64(byte[] data, int seed, int offset, int length) {
+ long h = (seed & 0xffffffffL) ^ (length * M64);
+
+ final int nblocks = length >> 3;
+
+ // body
+ for (int i = 0; i < nblocks; i++) {
+
+ long k = (long) BitUtil.VH_LE_LONG.get(data, offset);
+ k *= M64;
+ k ^= k >>> R64;
+ k *= M64;
+
+ h ^= k;
+ h *= M64;
+
+ offset += Long.BYTES;
+ }
+
+ int remaining = length & 0x07;
+ if (0 < remaining) {
+ for (int i = 0; i < remaining; i++) {
+ h ^= ((long) data[offset + i] & 0xff) << (Byte.SIZE * i);
+ }
+ h *= M64;
+ }
+
+ h ^= h >>> R64;
+ h *= M64;
+ h ^= h >>> R64;
+
+ return h;
+ }
+
+ @Override
+ public final long hash(BytesRef br) {
+ return hash64(br.bytes, 0xe17a1465, br.offset, br.length);
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+}