You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2020/03/10 21:41:53 UTC

[kudu] 02/03: [util] Import "Or" function to BlockBloomFilter from Impala

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 885d7946c771ea875919d479c67b2ddac5c5103e
Author: Bankim Bhavsar <ba...@cloudera.com>
AuthorDate: Tue Mar 3 16:18:17 2020 -0800

    [util] Import "Or" function to BlockBloomFilter from Impala
    
    Impala will be switching to using the Block Bloom filter from kudu-util.
    "Or" function was missing and this change adds it.
    
    Note that original implementation for OrEqualArrayAvx() in Impala is
    targeted for AVX and not AVX2, however AVX2 is super-set of AVX instructions
    and there is already provision in the Block Bloom filter to separate out
    AVX2 v/s non-AVX2 (SSE) code. Hence don't see need to add separate AVX
    specific file/implementation.
    
    Change-Id: Ibe5f9311f73dcff883dd2cce18fd558e7d57d14f
    Reviewed-on: http://gerrit.cloudera.org:8080/15373
    Tested-by: Kudu Jenkins
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 src/kudu/util/block_bloom_filter-test.cc | 33 +++++++++++++++++++
 src/kudu/util/block_bloom_filter.cc      | 56 ++++++++++++++++++++++++++++++++
 src/kudu/util/block_bloom_filter.h       | 28 ++++++++++++++--
 src/kudu/util/block_bloom_filter_avx2.cc | 20 +++++++++++-
 4 files changed, 134 insertions(+), 3 deletions(-)

diff --git a/src/kudu/util/block_bloom_filter-test.cc b/src/kudu/util/block_bloom_filter-test.cc
index 4c4d418..b9ed9f2 100644
--- a/src/kudu/util/block_bloom_filter-test.cc
+++ b/src/kudu/util/block_bloom_filter-test.cc
@@ -286,4 +286,37 @@ TEST_F(BlockBloomFilterTest, MinSpaceForFpp) {
     }
   }
 }
+
+TEST_F(BlockBloomFilterTest, Or) {
+  BlockBloomFilter* bf1 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
+  BlockBloomFilter* bf2 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
+
+  for (int i = 60; i < 80; ++i) bf2->Insert(i);
+  for (int i = 0; i < 10; ++i) bf1->Insert(i);
+
+  ASSERT_OK(bf1->Or(*bf2));
+  for (int i = 0; i < 10; ++i) ASSERT_TRUE(bf1->Find(i)) << i;
+  for (int i = 60; i < 80; ++i) ASSERT_TRUE(bf1->Find(i)) << i;
+
+  // Insert another value to aggregated BloomFilter.
+  for (int i = 11; i < 50; ++i) bf1->Insert(i);
+
+  for (int i = 11; i < 50; ++i) ASSERT_TRUE(bf1->Find(i)) << i;
+  ASSERT_FALSE(bf1->Find(81));
+
+  // Check that AlwaysFalse() is updated correctly.
+  BlockBloomFilter* bf3 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
+  BlockBloomFilter* always_false = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
+  ASSERT_OK(bf3->Or(*always_false));
+  EXPECT_TRUE(bf3->AlwaysFalse());
+  ASSERT_OK(bf3->Or(*bf2));
+  EXPECT_FALSE(bf3->AlwaysFalse());
+
+  // Invalid argument test cases.
+  BlockBloomFilter* bf4 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
+  BlockBloomFilter* bf5 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100000, 0.01));
+  Status s = bf4->Or(*bf5);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_STR_CONTAINS(s.ToString(), "Directory size don't match");
+}
 }  // namespace kudu
diff --git a/src/kudu/util/block_bloom_filter.cc b/src/kudu/util/block_bloom_filter.cc
index e09a7cc..acaf358 100644
--- a/src/kudu/util/block_bloom_filter.cc
+++ b/src/kudu/util/block_bloom_filter.cc
@@ -47,6 +47,9 @@ namespace kudu {
 
 constexpr uint32_t BlockBloomFilter::kRehash[8] __attribute__((aligned(32)));
 const base::CPU BlockBloomFilter::kCpu = base::CPU();
+// constexpr data member requires initialization in the class declaration.
+// Hence no duplicate initialization in the definition here.
+constexpr BlockBloomFilter* const BlockBloomFilter::kAlwaysTrueFilter;
 
 BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_allocator) :
   always_false_(true),
@@ -60,13 +63,16 @@ BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_all
   if (has_avx2()) {
     bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsertAVX2;
     bucket_find_func_ptr_ = &BlockBloomFilter::BucketFindAVX2;
+    or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArrayAVX2;
   } else {
     bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsert;
     bucket_find_func_ptr_ = &BlockBloomFilter::BucketFind;
+    or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArray;
   }
 #else
   bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsert;
   bucket_find_func_ptr_ = &BlockBloomFilter::BucketFind;
+  or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArray;
 #endif
 }
 
@@ -259,6 +265,56 @@ bool BlockBloomFilter::operator!=(const BlockBloomFilter& rhs) const {
   return !(rhs == *this);
 }
 
+void BlockBloomFilter::OrEqualArray(size_t n, const uint8_t* __restrict__ in,
+                                    uint8_t* __restrict__ out) {
+  // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
+  // written in a way that is very friendly to auto-vectorization. Instead, we manually
+  // vectorize, increasing the speed by up to 56x.
+  const __m128i* simd_in = reinterpret_cast<const __m128i*>(in);
+  const __m128i* const simd_in_end = reinterpret_cast<const __m128i*>(in + n);
+  __m128i* simd_out = reinterpret_cast<__m128i*>(out);
+  // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i)
+  // == 16, we can do two _mm_or_si128's in each iteration without checking array
+  // bounds.
+  while (simd_in != simd_in_end) {
+    for (int i = 0; i < 2; ++i, ++simd_in, ++simd_out) {
+      _mm_storeu_si128(
+          simd_out, _mm_or_si128(_mm_loadu_si128(simd_out), _mm_loadu_si128(simd_in)));
+    }
+  }
+}
+
+Status BlockBloomFilter::Or(const BlockBloomFilter& other) {
+  // AlwaysTrueFilter is a special case implemented with a nullptr.
+  // Hence Or'ing with an AlwaysTrueFilter will result in a Bloom filter that also
+  // always returns true which'll require destructing this Bloom filter.
+  // Moreover for a reference "other" to be an AlwaysTrueFilter the reference needs
+  // to be created from a nullptr and so we get into undefined behavior territory.
+  // Comparing AlwaysTrueFilter with "&other" results in a compiler warning for
+  // comparing a non-null argument "other" with NULL [-Wnonnull-compare].
+  // For above reasons, guard against it.
+  CHECK_NE(kAlwaysTrueFilter, &other);
+
+  if (this == &other) {
+    // No op.
+    return Status::OK();
+  }
+  if (directory_size() != other.directory_size()) {
+    return Status::InvalidArgument(Substitute("Directory size don't match. this: $0, other: $1",
+        directory_size(), other.directory_size()));
+  }
+  if (other.AlwaysFalse()) {
+    // Nothing to do.
+    return Status::OK();
+  }
+
+  (*or_equal_array_func_ptr_)(directory_size(),
+                              reinterpret_cast<uint8_t*>(other.directory_),
+                              reinterpret_cast<uint8_t*>(directory_));
+  always_false_ = false;
+  return Status::OK();
+}
+
 shared_ptr<DefaultBlockBloomFilterBufferAllocator>
     DefaultBlockBloomFilterBufferAllocator::GetSingletonSharedPtr() {
   // Meyer's Singleton.
diff --git a/src/kudu/util/block_bloom_filter.h b/src/kudu/util/block_bloom_filter.h
index a218dec..8a0fdf8 100644
--- a/src/kudu/util/block_bloom_filter.h
+++ b/src/kudu/util/block_bloom_filter.h
@@ -148,6 +148,21 @@ class BlockBloomFilter {
   bool operator==(const BlockBloomFilter& rhs) const;
   bool operator!=(const BlockBloomFilter& rhs) const;
 
+  // Computes the logical OR of this filter with 'other' and stores the result in this
+  // filter.
+  // Notes:
+  // - The directory sizes of the Bloom filters must match.
+  // - Or'ing with kAlwaysTrueFilter is disallowed.
+  Status Or(const BlockBloomFilter& other);
+
+  // Returns whether the Bloom filter is empty and hence would return false for all lookups.
+  bool AlwaysFalse() const {
+    return always_false_;
+  }
+
+  // Representation of a filter which allows all elements to pass.
+  static constexpr BlockBloomFilter* const kAlwaysTrueFilter = nullptr;
+
  private:
   // always_false_ is true when the bloom filter hasn't had any elements inserted.
   bool always_false_;
@@ -190,7 +205,7 @@ class BlockBloomFilter {
   // Helper function for public Init() variants.
   Status InitInternal(int log_space_bytes, HashAlgorithm hash_algorithm, uint32_t hash_seed);
 
-  // Same as Insert(), but skips the CPU check and assumes that AVX is not available.
+  // Same as Insert(), but skips the CPU check and assumes that AVX2 is not available.
   void InsertNoAvx2(uint32_t hash) noexcept;
 
   // Does the actual work of Insert(). bucket_idx is the index of the bucket to insert
@@ -199,8 +214,11 @@ class BlockBloomFilter {
 
   bool BucketFind(uint32_t bucket_idx, uint32_t hash) const noexcept;
 
+  // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n'.
+  static void OrEqualArray(size_t n, const uint8_t* __restrict__ in, uint8_t* __restrict__ out);
+
 #ifdef USE_AVX2
-  // Same as Insert(), but skips the CPU check and assumes that AVX is available.
+  // Same as Insert(), but skips the CPU check and assumes that AVX2 is available.
   void InsertAvx2(uint32_t hash) noexcept __attribute__((__target__("avx2")));
 
   // A faster SIMD version of BucketInsert().
@@ -210,12 +228,18 @@ class BlockBloomFilter {
   // A faster SIMD version of BucketFind().
   bool BucketFindAVX2(uint32_t bucket_idx, uint32_t hash) const noexcept
       __attribute__((__target__("avx2")));
+
+  // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX2
+  // instructions. 'n' must be a multiple of 32.
+  static void OrEqualArrayAVX2(size_t n, const uint8_t* __restrict__ in,
+                               uint8_t* __restrict__ out) __attribute__((target("avx2")));
 #endif
 
   // Function pointers initialized in constructor to avoid run-time cost
   // in hot-path of Find and Insert operations.
   decltype(&BlockBloomFilter::BucketInsert) bucket_insert_func_ptr_;
   decltype(&BlockBloomFilter::BucketFind) bucket_find_func_ptr_;
+  decltype(&BlockBloomFilter::OrEqualArray) or_equal_array_func_ptr_;
 
   // Returns amount of space used in log2 bytes.
   int log_space_bytes() const {
diff --git a/src/kudu/util/block_bloom_filter_avx2.cc b/src/kudu/util/block_bloom_filter_avx2.cc
index e10b6cc..93c3ff6 100644
--- a/src/kudu/util/block_bloom_filter_avx2.cc
+++ b/src/kudu/util/block_bloom_filter_avx2.cc
@@ -24,9 +24,14 @@
 
 #include "kudu/util/block_bloom_filter.h"
 
-#include <cstdint>
 #include <immintrin.h>
 
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+
+#include <glog/logging.h>
+
 #include "kudu/gutil/port.h"
 
 namespace kudu {
@@ -75,4 +80,17 @@ void BlockBloomFilter::InsertAvx2(const uint32_t hash) noexcept {
   BucketInsertAVX2(bucket_idx, hash);
 }
 
+void BlockBloomFilter::OrEqualArrayAVX2(size_t n, const uint8_t* __restrict__ in,
+                                        uint8_t* __restrict__ out) {
+  constexpr size_t kAVXRegisterBytes = sizeof(__m256d);
+  DCHECK_EQ(n % kAVXRegisterBytes, 0) << "Invalid Bloom filter directory size";
+  const uint8_t* const in_end = in + n;
+  for (; in != in_end; (in += kAVXRegisterBytes), (out += kAVXRegisterBytes)) {
+    const double* double_in = reinterpret_cast<const double*>(in);
+    double* double_out = reinterpret_cast<double*>(out);
+    _mm256_storeu_pd(double_out,
+                     _mm256_or_pd(_mm256_loadu_pd(double_out), _mm256_loadu_pd(double_in)));
+  }
+}
+
 } // namespace kudu