You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2020/03/14 09:21:03 UTC

[kudu] branch master updated: [util] Changes to allow serializing/de-serializing BlockBloomFilter to a custom format

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 1394746  [util] Changes to allow serializing/de-serializing BlockBloomFilter to a custom format
1394746 is described below

commit 1394746becb905ff460e0fcc615376d967bdc4c1
Author: Bankim Bhavsar <ba...@cloudera.com>
AuthorDate: Tue Mar 10 12:32:21 2020 -0700

    [util] Changes to allow serializing/de-serializing BlockBloomFilter to a custom format
    
    Consumers of the BlockBloomFilter like Impala would prefer using its own
    serialization/deserializing message format or library.
    
    Hence following set of changes:
    - InitFromDirectory() that allows initialization from a populated directory
      structure.
    - log_space_bytes() and directory() public functions to allow serialization.
    - Renamed AlwaysFalse() to always_false() since it's a simple get function
      providing access to data.
    
    Change-Id: I79504813024abaf452e2c2b244391024b442d6f7
    Reviewed-on: http://gerrit.cloudera.org:8080/15396
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Tested-by: Kudu Jenkins
    Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
---
 src/kudu/util/block_bloom_filter-test.cc |  4 ++--
 src/kudu/util/block_bloom_filter.cc      | 34 ++++++++++++++++++++------------
 src/kudu/util/block_bloom_filter.h       | 22 +++++++++++++++------
 3 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/src/kudu/util/block_bloom_filter-test.cc b/src/kudu/util/block_bloom_filter-test.cc
index b9ed9f2..8e12099 100644
--- a/src/kudu/util/block_bloom_filter-test.cc
+++ b/src/kudu/util/block_bloom_filter-test.cc
@@ -308,9 +308,9 @@ TEST_F(BlockBloomFilterTest, Or) {
   BlockBloomFilter* bf3 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
   BlockBloomFilter* always_false = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
   ASSERT_OK(bf3->Or(*always_false));
-  EXPECT_TRUE(bf3->AlwaysFalse());
+  EXPECT_TRUE(bf3->always_false());
   ASSERT_OK(bf3->Or(*bf2));
-  EXPECT_FALSE(bf3->AlwaysFalse());
+  EXPECT_FALSE(bf3->always_false());
 
   // Invalid argument test cases.
   BlockBloomFilter* bf4 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01));
diff --git a/src/kudu/util/block_bloom_filter.cc b/src/kudu/util/block_bloom_filter.cc
index acaf358..40a3d64 100644
--- a/src/kudu/util/block_bloom_filter.cc
+++ b/src/kudu/util/block_bloom_filter.cc
@@ -118,24 +118,32 @@ Status BlockBloomFilter::Init(const int log_space_bytes, HashAlgorithm hash_algo
   return Status::OK();
 }
 
+Status BlockBloomFilter::InitFromDirectory(int log_space_bytes,
+                                           const Slice& directory,
+                                           bool always_false,
+                                           HashAlgorithm hash_algorithm,
+                                           uint32_t hash_seed) {
+  RETURN_NOT_OK(InitInternal(log_space_bytes, hash_algorithm, hash_seed));
+  DCHECK_NOTNULL(directory_);
+
+  if (directory_size() != directory.size()) {
+    return Status::InvalidArgument(
+        Substitute("Mismatch in BlockBloomFilter source directory size $0 and expected size $1",
+                   directory.size(), directory_size()));
+  }
+  memcpy(directory_, directory.data(), directory.size());
+  always_false_ = always_false;
+  return Status();
+}
+
 Status BlockBloomFilter::InitFromPB(const BlockBloomFilterPB& bf_src) {
   if (!bf_src.has_log_space_bytes() || !bf_src.has_bloom_data() ||
       !bf_src.has_hash_algorithm() || !bf_src.has_always_false()) {
     return Status::InvalidArgument("Missing arguments to initialize BlockBloomFilter.");
   }
 
-  RETURN_NOT_OK(InitInternal(bf_src.log_space_bytes(), bf_src.hash_algorithm(),
-                             bf_src.hash_seed()));
-  DCHECK_NOTNULL(directory_);
-
-  if (directory_size() != bf_src.bloom_data().size()) {
-    return Status::InvalidArgument(
-        Substitute("Mismatch in BlockBloomFilter source directory size $0 and expected size $1",
-                   bf_src.bloom_data().size(), directory_size()));
-  }
-  memcpy(directory_, bf_src.bloom_data().data(), bf_src.bloom_data().size());
-  always_false_ = bf_src.always_false();
-  return Status::OK();
+  return InitFromDirectory(bf_src.log_space_bytes(), bf_src.bloom_data(), bf_src.always_false(),
+                           bf_src.hash_algorithm(), bf_src.hash_seed());
 }
 
 Status BlockBloomFilter::Clone(BlockBloomFilterBufferAllocatorIf* allocator,
@@ -303,7 +311,7 @@ Status BlockBloomFilter::Or(const BlockBloomFilter& other) {
     return Status::InvalidArgument(Substitute("Directory size don't match. this: $0, other: $1",
         directory_size(), other.directory_size()));
   }
-  if (other.AlwaysFalse()) {
+  if (other.always_false()) {
     // Nothing to do.
     return Status::OK();
   }
diff --git a/src/kudu/util/block_bloom_filter.h b/src/kudu/util/block_bloom_filter.h
index 8a0fdf8..6a9a991 100644
--- a/src/kudu/util/block_bloom_filter.h
+++ b/src/kudu/util/block_bloom_filter.h
@@ -91,6 +91,10 @@ class BlockBloomFilter {
   Status Init(int log_space_bytes, HashAlgorithm hash_algorithm, uint32_t hash_seed);
   // Initialize the BlockBloomFilter by de-serializing the protobuf message.
   Status InitFromPB(const BlockBloomFilterPB& bf_src);
+  // Initialize the BlockBloomFilter from a populated "directory" structure.
+  // Useful for initializing the BlockBloomFilter by de-serializing a custom protobuf message.
+  Status InitFromDirectory(int log_space_bytes, const Slice& directory, bool always_false,
+                           HashAlgorithm hash_algorithm, uint32_t hash_seed);
 
   // Clones the BlockBloomFilter using the supplied "allocator". The allocator is expected
   // to remain valid during the lifetime of the cloned BlockBloomFilter.
@@ -156,10 +160,21 @@ class BlockBloomFilter {
   Status Or(const BlockBloomFilter& other);
 
   // Returns whether the Bloom filter is empty and hence would return false for all lookups.
-  bool AlwaysFalse() const {
+  bool always_false() const {
     return always_false_;
   }
 
+  // Returns amount of space used in log2 bytes.
+  int log_space_bytes() const {
+    return log_num_buckets_ + kLogBucketByteSize;
+  }
+
+  // Returns the directory structure. Useful for serializing the BlockBloomFilter to
+  // a custom protobuf message.
+  Slice directory() const {
+    return Slice(reinterpret_cast<const uint8_t*>(directory_), directory_size());
+  }
+
   // Representation of a filter which allows all elements to pass.
   static constexpr BlockBloomFilter* const kAlwaysTrueFilter = nullptr;
 
@@ -241,11 +256,6 @@ class BlockBloomFilter {
   decltype(&BlockBloomFilter::BucketFind) bucket_find_func_ptr_;
   decltype(&BlockBloomFilter::OrEqualArray) or_equal_array_func_ptr_;
 
-  // Returns amount of space used in log2 bytes.
-  int log_space_bytes() const {
-    return log_num_buckets_ + kLogBucketByteSize;
-  }
-
   // Size of the internal directory structure in bytes.
   int64_t directory_size() const {
     return 1ULL << log_space_bytes();