You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/08/12 03:15:07 UTC

[impala] 04/04: IMPALA-11468: Port "Block Bloom filter false positive correction" from Kudu

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 15ce822e77325969cf75e198979c55acd671ef19
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Fri Jul 29 15:56:21 2022 -0700

    IMPALA-11468: Port "Block Bloom filter false positive correction" from Kudu
    
    Block Bloom filters have a higher false positive rate than standard
    Bloom filters, due to the uneven distribution of keys between
    buckets. This patch changes the code to match the theory, using an
    approximation from the paper that introduced block Bloom filters,
    "Cache-, Hash- and Space-Efficient Bloom Filters" by Putze et al.
    
    In scan_predicate.cc, filters are created with
    BlockBloomFilter::MinLogSpace. Prior to this patch, that method will
    sometimes return a value that is lower than the true answer, leading
    to smaller filters and higher false positive probabilities than
    expected. This patch corrects BlockBloomFilter::MinLogSpace, leading
    to filters having the expected false positive rate by dint of their
    larger size. The performance impact here is dependent on the extent
    to which a scan is bottlenecked by heap space for the filter vs. compute
    time for the scan predicate application to filter false positives.
    
    For a false positive probability of 1%, as is currently set in
    scan_predicate.cc, this patch leads to an increase in filter size of
    about 10% and a decrease in filter false positive probability by
    50%. However, this is obscured by the coarse constraint that filters
    must have a size in bytes that is a power of two. Loosening that
    restriction is potential future work.
    
    Porting Notes:
     - The MaxNdv() function is not present in Impala, so it is omitted.
     - This resolves a test failure for ParquetBloomFilter.FindInvalid
       when building with GCC 10 and the associated libstdc++.
     - This adds a comment noting that the test is also dependent
       on the libstdc++ implementation of unordered_set.
    
    Change-Id: Ic992e47976274e3ef0db3633d38e5a8e886274b4
    Reviewed-on: http://gerrit.cloudera.org:8080/18807
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Joe McDonnell <jo...@cloudera.com>
---
 be/src/util/parquet-bloom-filter-test.cc | 33 +++++++++++-----
 be/src/util/parquet-bloom-filter.cc      | 65 +++++++++++++++++++++++++-------
 2 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/be/src/util/parquet-bloom-filter-test.cc b/be/src/util/parquet-bloom-filter-test.cc
index 7ae3449fb..014b3fe41 100644
--- a/be/src/util/parquet-bloom-filter-test.cc
+++ b/be/src/util/parquet-bloom-filter-test.cc
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "common/names.h"
+#include "kudu/util/random.h"
 #include "testutil/gtest-util.h"
 #include "util/parquet-bloom-filter.h"
 
@@ -148,23 +149,32 @@ TEST(ParquetBloomFilter, CumulativeFind) {
 // The empirical false positives we find when looking for random items is with a constant
 // factor of the false positive probability the Bloom filter was constructed for.
 TEST(ParquetBloomFilter, FindInvalid) {
-  srand(0);
-  static const int find_limit = 1 << 20;
+  // We use a deterministic pseudorandom number generator with a set seed. The reason is
+  // that with a run-dependent seed, there will always be inputs that can fail. That's a
+  // potential argument for this to be a benchmark rather than a test, although the
+  // measured quantity would be not time but deviation from predicted fpp.
+  kudu::Random rgen(867 + 5309);
+  static const int find_limit = 1 << 22;
   std::unordered_set<uint64_t> to_find;
   while (to_find.size() < find_limit) {
-    to_find.insert(MakeRand());
+    to_find.insert(rgen.Next64());
   }
   static const int max_log_ndv = 19;
   std::unordered_set<uint64_t> to_insert;
   while (to_insert.size() < (1ull << max_log_ndv)) {
-    const uint64_t candidate = MakeRand();
+    const uint64_t candidate = rgen.Next64();
     if (to_find.find(candidate) == to_find.end()) {
       to_insert.insert(candidate);
     }
   }
+  // NOTE: Even though this was generated with a deterministic pseudorandom number
+  // generator, the order of items in the vector is dependent on the implementation
+  // of unordered_set, which can change with different libstdc++ versions. Since
+  // the tests below use the first X entries, changes in the order also change the
+  // test.
   vector<uint64_t> shuffled_insert(to_insert.begin(), to_insert.end());
   for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) {
-    for (int log_fpp = 4; log_fpp < 15; ++log_fpp) {
+    for (int log_fpp = 4; log_fpp < 12; ++log_fpp) {
       double fpp = 1.0 / (1 << log_fpp);
       const size_t ndv = 1 << log_ndv;
       BloomWrapper wrapper = CreateBloomFilter(ndv, fpp);
@@ -182,18 +192,21 @@ TEST(ParquetBloomFilter, FindInvalid) {
       }
       EXPECT_LE(found, find_limit * fpp * 2)
           << "Too many false positives with -log2(fpp) = " << log_fpp
-          << ", ndv = " << ndv;
+          << " and ndv = " << ndv;
 
       // Because the space is rounded up to a power of 2, we might actually get a lower
       // fpp than the one passed to MinLogSpace().
       const int log_space = log2(wrapper.storage->size());
       const double expected_fpp =
           ParquetBloomFilter::FalsePositiveProb(ndv, log_space);
-      EXPECT_GE(found, find_limit * expected_fpp)
-          << "Too few false positives with -log2(fpp) = " << log_fpp << ", ndv = " << ndv;
-      EXPECT_LE(found, find_limit * expected_fpp * 8)
+      // Fudge factors are present because filter characteristics are true in the limit,
+      // and will deviate for small samples.
+      EXPECT_GE(found, find_limit * expected_fpp * 0.75)
+          << "Too few false positives with -log2(fpp) = " << log_fpp
+          << " expected_fpp = " << expected_fpp;
+      EXPECT_LE(found, find_limit * expected_fpp * 1.25)
           << "Too many false positives with -log2(fpp) = " << log_fpp
-          << ", ndv = " << ndv;
+          << " expected_fpp = " << expected_fpp;
     }
   }
 }
diff --git a/be/src/util/parquet-bloom-filter.cc b/be/src/util/parquet-bloom-filter.cc
index d1f9366e4..3b7d7b892 100644
--- a/be/src/util/parquet-bloom-filter.cc
+++ b/be/src/util/parquet-bloom-filter.cc
@@ -31,6 +31,7 @@
   #include <mm_malloc.h>
 #endif
 
+#include <climits>
 #include <cmath>
 #include <cstdint>
 
@@ -146,21 +147,59 @@ int ParquetBloomFilter::OptimalByteSize(const size_t ndv, const double fpp) {
   return min_space;
 }
 
-int ParquetBloomFilter::MinLogSpace(const size_t ndv, const double fpp) {
-  static const double k = kBucketWords;
-  if (0 == ndv) return 0;
-  // m is the number of bits we would need to get the fpp specified
-  const double m = -k * ndv / log(1 - pow(fpp, 1.0 / k));
-
-  // Handle case where ndv == 1 => ceil(log2(m/8)) < 0.
-  return std::max(0, static_cast<int>(ceil(log2(m / 8))));
+// This implements the false positive probability in Putze et al.'s "Cache-, hash-and
+// space-efficient bloom filters", equation 3.
+double ParquetBloomFilter::FalsePositiveProb(const size_t ndv, const int log_space_bytes) {
+  static constexpr double kWordBits = 1 << kLogBucketWordBits;
+  const double bytes = static_cast<double>(1L << log_space_bytes);
+  if (ndv == 0) return 0.0;
+  if (bytes <= 0) return 1.0;
+  // This short-cuts a slowly-converging sum for very dense filters
+  if (ndv / (bytes * CHAR_BIT) > 2) return 1.0;
+
+  double result = 0;
+  // lam is the usual parameter to the Poisson's PMF. Following the notation in the paper,
+  // lam is B/c, where B is the number of bits in a bucket and c is the number of bits per
+  // distinct value
+  const double lam = kBucketWords * kWordBits / ((bytes * CHAR_BIT) / ndv);
+  // Some of the calculations are done in log-space to increase numerical stability
+  const double loglam = log(lam);
+
+  // 750 iterations are sufficient to cause the sum to converge in all of the tests. In
+  // other words, setting the iterations higher than 750 will give the same result as
+  // leaving it at 750.
+  static constexpr uint64_t kBloomFppIters = 750;
+  for (uint64_t j = 0; j < kBloomFppIters; ++j) {
+    // We start with the highest value of i, since the values we're adding to result are
+    // mostly smaller at high i, and this increases accuracy to sum from the smallest
+    // values up.
+    const double i = static_cast<double>(kBloomFppIters - 1 - j);
+    // The PMF of the Poisson distribution is lam^i * exp(-lam) / i!. In logspace, using
+    // lgamma for the log of the factorial function:
+    double logp = i * loglam - lam - lgamma(i + 1);
+    // The f_inner part of the equation in the paper is the probability of a single
+    // collision in the bucket. Since there are kBucketWords non-overlapping lanes in each
+    // bucket, the log of this probability is:
+    const double logfinner = kBucketWords * log(1.0 - pow(1.0 - 1.0 / kWordBits, i));
+    // Here we are forced out of log-space calculations
+    result += exp(logp + logfinner);
+  }
+  return (result > 1.0) ? 1.0 : result;
 }
 
-double ParquetBloomFilter::FalsePositiveProb(const size_t ndv,
-    const int log_space_bytes) {
-  return pow(1 - exp((-1.0 * static_cast<double>(kBucketWords) * static_cast<double>(ndv))
-                     / static_cast<double>(1ULL << (log_space_bytes + 3))),
-             kBucketWords);
+int ParquetBloomFilter::MinLogSpace(const size_t ndv, const double fpp) {
+  int low = 0;
+  int high = 64;
+  while (high > low + 1) {
+    int mid = (high + low) / 2;
+    const double candidate = FalsePositiveProb(ndv, mid);
+    if (candidate <= fpp) {
+      high = mid;
+    } else {
+      low = mid;
+    }
+  }
+  return high;
 }
 
 uint64_t ParquetBloomFilter::Hash(const uint8_t* input, size_t size) {