You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by da...@apache.org on 2022/07/13 10:58:51 UTC
[doris] branch master updated: [feature-wip](unique-key-merge-on-write) add bloom filter index for primary key, DSIP-018[1.2] (#10706)
This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 56b55563c6 [feature-wip](unique-key-merge-on-write) add bloom filter index for primary key, DSIP-018[1.2] (#10706)
56b55563c6 is described below
commit 56b55563c6a95e2ab8aa7106e763330469ffe1f2
Author: Xin Liao <li...@126.com>
AuthorDate: Wed Jul 13 18:58:45 2022 +0800
[feature-wip](unique-key-merge-on-write) add bloom filter index for primary key, DSIP-018[1.2] (#10706)
---
be/src/olap/primary_key_index.cpp | 32 +++++++++++++++++++++++++-------
be/src/olap/primary_key_index.h | 25 +++++++++++++++++++------
be/test/olap/primary_key_index_test.cpp | 10 +++++++++-
gensrc/proto/segment_v2.proto | 10 ++++++++++
4 files changed, 63 insertions(+), 14 deletions(-)
diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp
index 390f326949..a8ef37a7fa 100644
--- a/be/src/olap/primary_key_index.cpp
+++ b/be/src/olap/primary_key_index.cpp
@@ -30,12 +30,17 @@ Status PrimaryKeyIndexBuilder::init() {
options.encoding = segment_v2::EncodingInfo::get_default_encoding(type_info, true);
// TODO(liaoxin) test to confirm whether it needs to be compressed
options.compression = segment_v2::NO_COMPRESSION; // currently not compressed
- _index_builder.reset(new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
- return _index_builder->init();
+ _primary_key_index_builder.reset(
+ new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
+ RETURN_IF_ERROR(_primary_key_index_builder->init());
+
+ return segment_v2::BloomFilterIndexWriter::create(segment_v2::BloomFilterOptions(), type_info,
+ &_bloom_filter_index_builder);
}
Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
- _index_builder->add(&key);
+ RETURN_IF_ERROR(_primary_key_index_builder->add(&key));
+ _bloom_filter_index_builder->add_values(&key, 1);
// the key is already sorted, so the first key is min_key, and
// the last key is max_key.
if (UNLIKELY(_num_rows == 0)) {
@@ -48,17 +53,30 @@ Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
return Status::OK();
}
-Status PrimaryKeyIndexBuilder::finalize(segment_v2::IndexedColumnMetaPB* meta) {
+Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta) {
// finish primary key index
- return _index_builder->finish(meta);
+ RETURN_IF_ERROR(_primary_key_index_builder->finish(meta->mutable_primary_key_index()));
+
+ // finish bloom filter index
+ RETURN_IF_ERROR(_bloom_filter_index_builder->flush());
+ return _bloom_filter_index_builder->finish(_file_writer, meta->mutable_bloom_filter_index());
}
Status PrimaryKeyIndexReader::parse(io::FileSystem* fs, const std::string& path,
- const segment_v2::IndexedColumnMetaPB& meta) {
+ const segment_v2::PrimaryKeyIndexMetaPB& meta) {
// parse primary key index
- _index_reader.reset(new segment_v2::IndexedColumnReader(fs, path, meta));
+ _index_reader.reset(new segment_v2::IndexedColumnReader(fs, path, meta.primary_key_index()));
RETURN_IF_ERROR(_index_reader->load(_use_page_cache, _kept_in_memory));
+ // parse bloom filter
+ segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index();
+ segment_v2::BloomFilterIndexReader bf_index_reader(fs, path,
+ &column_index_meta.bloom_filter_index());
+ RETURN_IF_ERROR(bf_index_reader.load(_use_page_cache, _kept_in_memory));
+ std::unique_ptr<segment_v2::BloomFilterIndexIterator> bf_iter;
+ RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter));
+ RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf));
+
_parsed = true;
return Status::OK();
}
diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h
index cbd9f5c0ce..faa1a6cf80 100644
--- a/be/src/olap/primary_key_index.h
+++ b/be/src/olap/primary_key_index.h
@@ -17,12 +17,12 @@
#pragma once
-#include <string>
-
#include "common/status.h"
#include "gen_cpp/segment_v2.pb.h"
-#include "io/fs/file_system.h"
#include "io/fs/file_writer.h"
+#include "olap/rowset/segment_v2/bloom_filter.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
#include "olap/rowset/segment_v2/indexed_column_reader.h"
#include "olap/rowset/segment_v2/indexed_column_writer.h"
#include "util/faststring.h"
@@ -50,7 +50,7 @@ public:
Slice min_key() { return Slice(_min_key); }
Slice max_key() { return Slice(_max_key); }
- Status finalize(segment_v2::IndexedColumnMetaPB* meta);
+ Status finalize(segment_v2::PrimaryKeyIndexMetaPB* meta);
private:
io::FileWriter* _file_writer = nullptr;
@@ -59,7 +59,8 @@ private:
faststring _min_key;
faststring _max_key;
- std::unique_ptr<segment_v2::IndexedColumnWriter> _index_builder;
+ std::unique_ptr<segment_v2::IndexedColumnWriter> _primary_key_index_builder;
+ std::unique_ptr<segment_v2::BloomFilterIndexWriter> _bloom_filter_index_builder;
};
class PrimaryKeyIndexReader {
@@ -67,7 +68,7 @@ public:
PrimaryKeyIndexReader() : _parsed(false) {}
Status parse(io::FileSystem* fs, const std::string& path,
- const segment_v2::IndexedColumnMetaPB& meta);
+ const segment_v2::PrimaryKeyIndexMetaPB& meta);
Status new_iterator(std::unique_ptr<segment_v2::IndexedColumnIterator>* index_iterator) const {
DCHECK(_parsed);
@@ -75,6 +76,17 @@ public:
return Status::OK();
}
+ const TypeInfo* type_info() const {
+ DCHECK(_parsed);
+ return _index_reader->type_info();
+ }
+
+ // verify whether exist in BloomFilter
+ bool check_present(const Slice& key) {
+ DCHECK(_parsed);
+ return _bf->test_bytes(key.data, key.size);
+ }
+
uint32_t num_rows() const {
DCHECK(_parsed);
return _index_reader->num_values();
@@ -85,6 +97,7 @@ private:
bool _use_page_cache = true;
bool _kept_in_memory = true;
std::unique_ptr<segment_v2::IndexedColumnReader> _index_reader;
+ std::unique_ptr<segment_v2::BloomFilter> _bf;
};
} // namespace doris
diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp
index ffa89ef399..ae0a701ef4 100644
--- a/be/test/olap/primary_key_index_test.cpp
+++ b/be/test/olap/primary_key_index_test.cpp
@@ -67,7 +67,7 @@ TEST_F(PrimaryKeyIndexTest, builder) {
}
EXPECT_EQ("1000", builder.min_key().to_string());
EXPECT_EQ("9998", builder.max_key().to_string());
- segment_v2::IndexedColumnMetaPB index_meta;
+ segment_v2::PrimaryKeyIndexMetaPB index_meta;
EXPECT_TRUE(builder.finalize(&index_meta));
EXPECT_TRUE(file_writer->close().ok());
EXPECT_EQ(num_rows, builder.num_rows());
@@ -81,6 +81,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
bool exact_match = false;
uint32_t row_id;
for (size_t i = 0; i < keys.size(); i++) {
+ bool exists = index_reader.check_present(keys[i]);
+ EXPECT_TRUE(exists);
auto status = index_iterator->seek_at_or_after(&keys[i], &exact_match);
EXPECT_TRUE(status.ok());
EXPECT_TRUE(exact_match);
@@ -91,6 +93,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
{
string key("8701");
Slice slice(key);
+ bool exists = index_reader.check_present(slice);
+ EXPECT_FALSE(exists);
auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
EXPECT_TRUE(status.ok());
EXPECT_FALSE(exact_match);
@@ -102,6 +106,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
{
string key("87");
Slice slice(key);
+ bool exists = index_reader.check_present(slice);
+ EXPECT_FALSE(exists);
auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
EXPECT_TRUE(status.ok());
EXPECT_FALSE(exact_match);
@@ -113,6 +119,8 @@ TEST_F(PrimaryKeyIndexTest, builder) {
{
string key("9999");
Slice slice(key);
+ bool exists = index_reader.check_present(slice);
+ EXPECT_FALSE(exists);
auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
EXPECT_FALSE(exact_match);
EXPECT_TRUE(status.is_not_found());
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index 3f41977fdb..c7689c8fb6 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -162,6 +162,13 @@ message ColumnMetaPB {
}
+message PrimaryKeyIndexMetaPB {
+ // primary key index
+ optional IndexedColumnMetaPB primary_key_index = 1;
+ // bloom filter index
+ optional ColumnIndexMetaPB bloom_filter_index = 2;
+}
+
message SegmentFooterPB {
optional uint32 version = 1 [default = 1]; // file version
repeated ColumnMetaPB columns = 2; // tablet schema
@@ -175,6 +182,9 @@ message SegmentFooterPB {
// Short key index's page
optional PagePointerPB short_key_index_page = 9;
+
+ // Primary key index meta
+ optional PrimaryKeyIndexMetaPB primary_key_index_meta = 10;
}
message BTreeMetaPB {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org