You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by da...@apache.org on 2022/07/13 10:58:51 UTC

[doris] branch master updated: [feature-wip](unique-key-merge-on-write) add bloom filter index for primary key, DSIP-018[1.2] (#10706)

This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 56b55563c6 [feature-wip](unique-key-merge-on-write) add bloom filter index for primary key, DSIP-018[1.2] (#10706)
56b55563c6 is described below

commit 56b55563c6a95e2ab8aa7106e763330469ffe1f2
Author: Xin Liao <li...@126.com>
AuthorDate: Wed Jul 13 18:58:45 2022 +0800

    [feature-wip](unique-key-merge-on-write) add bloom filter index for primary key, DSIP-018[1.2] (#10706)
---
 be/src/olap/primary_key_index.cpp       | 32 +++++++++++++++++++++++++-------
 be/src/olap/primary_key_index.h         | 25 +++++++++++++++++++------
 be/test/olap/primary_key_index_test.cpp | 10 +++++++++-
 gensrc/proto/segment_v2.proto           | 10 ++++++++++
 4 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp
index 390f326949..a8ef37a7fa 100644
--- a/be/src/olap/primary_key_index.cpp
+++ b/be/src/olap/primary_key_index.cpp
@@ -30,12 +30,17 @@ Status PrimaryKeyIndexBuilder::init() {
     options.encoding = segment_v2::EncodingInfo::get_default_encoding(type_info, true);
     // TODO(liaoxin) test to confirm whether it needs to be compressed
     options.compression = segment_v2::NO_COMPRESSION; // currently not compressed
-    _index_builder.reset(new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
-    return _index_builder->init();
+    _primary_key_index_builder.reset(
+            new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
+    RETURN_IF_ERROR(_primary_key_index_builder->init());
+
+    return segment_v2::BloomFilterIndexWriter::create(segment_v2::BloomFilterOptions(), type_info,
+                                                      &_bloom_filter_index_builder);
 }
 
 Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
-    _index_builder->add(&key);
+    RETURN_IF_ERROR(_primary_key_index_builder->add(&key));
+    _bloom_filter_index_builder->add_values(&key, 1);
     // the key is already sorted, so the first key is min_key, and
     // the last key is max_key.
     if (UNLIKELY(_num_rows == 0)) {
@@ -48,17 +53,30 @@ Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
     return Status::OK();
 }
 
-Status PrimaryKeyIndexBuilder::finalize(segment_v2::IndexedColumnMetaPB* meta) {
+Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta) {
     // finish primary key index
-    return _index_builder->finish(meta);
+    RETURN_IF_ERROR(_primary_key_index_builder->finish(meta->mutable_primary_key_index()));
+
+    // finish bloom filter index
+    RETURN_IF_ERROR(_bloom_filter_index_builder->flush());
+    return _bloom_filter_index_builder->finish(_file_writer, meta->mutable_bloom_filter_index());
 }
 
 Status PrimaryKeyIndexReader::parse(io::FileSystem* fs, const std::string& path,
-                                    const segment_v2::IndexedColumnMetaPB& meta) {
+                                    const segment_v2::PrimaryKeyIndexMetaPB& meta) {
     // parse primary key index
-    _index_reader.reset(new segment_v2::IndexedColumnReader(fs, path, meta));
+    _index_reader.reset(new segment_v2::IndexedColumnReader(fs, path, meta.primary_key_index()));
     RETURN_IF_ERROR(_index_reader->load(_use_page_cache, _kept_in_memory));
 
+    // parse bloom filter
+    segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index();
+    segment_v2::BloomFilterIndexReader bf_index_reader(fs, path,
+                                                       &column_index_meta.bloom_filter_index());
+    RETURN_IF_ERROR(bf_index_reader.load(_use_page_cache, _kept_in_memory));
+    std::unique_ptr<segment_v2::BloomFilterIndexIterator> bf_iter;
+    RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter));
+    RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf));
+
     _parsed = true;
     return Status::OK();
 }
diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h
index cbd9f5c0ce..faa1a6cf80 100644
--- a/be/src/olap/primary_key_index.h
+++ b/be/src/olap/primary_key_index.h
@@ -17,12 +17,12 @@
 
 #pragma once
 
-#include <string>
-
 #include "common/status.h"
 #include "gen_cpp/segment_v2.pb.h"
-#include "io/fs/file_system.h"
 #include "io/fs/file_writer.h"
+#include "olap/rowset/segment_v2/bloom_filter.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
 #include "olap/rowset/segment_v2/indexed_column_reader.h"
 #include "olap/rowset/segment_v2/indexed_column_writer.h"
 #include "util/faststring.h"
@@ -50,7 +50,7 @@ public:
     Slice min_key() { return Slice(_min_key); }
     Slice max_key() { return Slice(_max_key); }
 
-    Status finalize(segment_v2::IndexedColumnMetaPB* meta);
+    Status finalize(segment_v2::PrimaryKeyIndexMetaPB* meta);
 
 private:
     io::FileWriter* _file_writer = nullptr;
@@ -59,7 +59,8 @@ private:
 
     faststring _min_key;
     faststring _max_key;
-    std::unique_ptr<segment_v2::IndexedColumnWriter> _index_builder;
+    std::unique_ptr<segment_v2::IndexedColumnWriter> _primary_key_index_builder;
+    std::unique_ptr<segment_v2::BloomFilterIndexWriter> _bloom_filter_index_builder;
 };
 
 class PrimaryKeyIndexReader {
@@ -67,7 +68,7 @@ public:
     PrimaryKeyIndexReader() : _parsed(false) {}
 
     Status parse(io::FileSystem* fs, const std::string& path,
-                 const segment_v2::IndexedColumnMetaPB& meta);
+                 const segment_v2::PrimaryKeyIndexMetaPB& meta);
 
     Status new_iterator(std::unique_ptr<segment_v2::IndexedColumnIterator>* index_iterator) const {
         DCHECK(_parsed);
@@ -75,6 +76,17 @@ public:
         return Status::OK();
     }
 
+    const TypeInfo* type_info() const {
+        DCHECK(_parsed);
+        return _index_reader->type_info();
+    }
+
+    // verify whether exist in BloomFilter
+    bool check_present(const Slice& key) {
+        DCHECK(_parsed);
+        return _bf->test_bytes(key.data, key.size);
+    }
+
     uint32_t num_rows() const {
         DCHECK(_parsed);
         return _index_reader->num_values();
@@ -85,6 +97,7 @@ private:
     bool _use_page_cache = true;
     bool _kept_in_memory = true;
     std::unique_ptr<segment_v2::IndexedColumnReader> _index_reader;
+    std::unique_ptr<segment_v2::BloomFilter> _bf;
 };
 
 } // namespace doris
diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp
index ffa89ef399..ae0a701ef4 100644
--- a/be/test/olap/primary_key_index_test.cpp
+++ b/be/test/olap/primary_key_index_test.cpp
@@ -67,7 +67,7 @@ TEST_F(PrimaryKeyIndexTest, builder) {
     }
     EXPECT_EQ("1000", builder.min_key().to_string());
     EXPECT_EQ("9998", builder.max_key().to_string());
-    segment_v2::IndexedColumnMetaPB index_meta;
+    segment_v2::PrimaryKeyIndexMetaPB index_meta;
     EXPECT_TRUE(builder.finalize(&index_meta));
     EXPECT_TRUE(file_writer->close().ok());
     EXPECT_EQ(num_rows, builder.num_rows());
@@ -81,6 +81,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
     bool exact_match = false;
     uint32_t row_id;
     for (size_t i = 0; i < keys.size(); i++) {
+        bool exists = index_reader.check_present(keys[i]);
+        EXPECT_TRUE(exists);
         auto status = index_iterator->seek_at_or_after(&keys[i], &exact_match);
         EXPECT_TRUE(status.ok());
         EXPECT_TRUE(exact_match);
@@ -91,6 +93,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
     {
         string key("8701");
         Slice slice(key);
+        bool exists = index_reader.check_present(slice);
+        EXPECT_FALSE(exists);
         auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
         EXPECT_TRUE(status.ok());
         EXPECT_FALSE(exact_match);
@@ -102,6 +106,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
     {
         string key("87");
         Slice slice(key);
+        bool exists = index_reader.check_present(slice);
+        EXPECT_FALSE(exists);
         auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
         EXPECT_TRUE(status.ok());
         EXPECT_FALSE(exact_match);
@@ -113,6 +119,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
     {
         string key("9999");
         Slice slice(key);
+        bool exists = index_reader.check_present(slice);
+        EXPECT_FALSE(exists);
         auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
         EXPECT_FALSE(exact_match);
         EXPECT_TRUE(status.is_not_found());
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index 3f41977fdb..c7689c8fb6 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -162,6 +162,13 @@ message ColumnMetaPB {
 
 }
 
+message PrimaryKeyIndexMetaPB {
+    // primary key index
+    optional IndexedColumnMetaPB primary_key_index = 1;
+    // bloom filter index
+    optional ColumnIndexMetaPB bloom_filter_index = 2;
+}
+
 message SegmentFooterPB {
     optional uint32 version = 1 [default = 1]; // file version
     repeated ColumnMetaPB columns = 2; // tablet schema
@@ -175,6 +182,9 @@ message SegmentFooterPB {
 
     // Short key index's page
     optional PagePointerPB short_key_index_page = 9;
+
+    // Primary key index meta
+    optional PrimaryKeyIndexMetaPB primary_key_index_meta = 10;
 }
 
 message BTreeMetaPB {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org