You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zh...@apache.org on 2023/06/07 01:55:28 UTC
[doris] 04/04: [Enhancement](merge-on-write) optimize bloom filter for primary key index (#20182)
This is an automated email from the ASF dual-hosted git repository.
zhangchen pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8ea3dfcd101ec60eee963a80a59180796ebfca1f
Author: Xin Liao <li...@126.com>
AuthorDate: Wed May 31 09:49:15 2023 +0800
[Enhancement](merge-on-write) optimize bloom filter for primary key index (#20182)
---
be/src/olap/primary_key_index.cpp | 8 ++-
.../segment_v2/bloom_filter_index_writer.cpp | 59 +++++++++++++++++++++-
.../rowset/segment_v2/bloom_filter_index_writer.h | 39 ++++++++++++++
3 files changed, 103 insertions(+), 3 deletions(-)
diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp
index 7c2c5fe16a..6d5dc0edd2 100644
--- a/be/src/olap/primary_key_index.cpp
+++ b/be/src/olap/primary_key_index.cpp
@@ -19,6 +19,9 @@
#include "common/config.h"
#include "io/fs/file_reader.h"
+#include "olap/olap_common.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
#include "olap/rowset/segment_v2/encoding_info.h"
namespace doris {
@@ -36,8 +39,9 @@ Status PrimaryKeyIndexBuilder::init() {
new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
RETURN_IF_ERROR(_primary_key_index_builder->init());
- return segment_v2::BloomFilterIndexWriter::create(segment_v2::BloomFilterOptions(), type_info,
- &_bloom_filter_index_builder);
+ _bloom_filter_index_builder.reset(new segment_v2::PrimaryKeyBloomFilterIndexWriterImpl(
+ segment_v2::BloomFilterOptions(), type_info));
+ return Status::OK();
}
Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
index 5542a6068d..c8f8cc8c50 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
@@ -25,7 +25,6 @@
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/rowset/segment_v2/indexed_column_writer.h"
#include "olap/types.h"
-#include "runtime/mem_pool.h"
#include "util/faststring.h"
#include "util/slice.h"
@@ -170,6 +169,64 @@ private:
} // namespace
+// Buffers `count` values for the bloom filter that the next flush() will build.
+// `values` is interpreted as a contiguous array of `count` Slice objects. Each
+// one is deep-copied through `_type_info` into `_pool` and the copy is appended
+// to `_values`. No deduplication is done here.
+void PrimaryKeyBloomFilterIndexWriterImpl::add_values(const void* values, size_t count) {
+    const Slice* v = static_cast<const Slice*>(values);
+    // The index has the same type as `count`: an `int` index mixes signed and
+    // unsigned operands in the comparison and overflows for counts > INT_MAX.
+    for (size_t i = 0; i < count; ++i) {
+        Slice new_value;
+        _type_info->deep_copy(&new_value, v + i, &_pool);
+        _values.push_back(new_value);
+    }
+}
+
+// Turns the values buffered so far into one block bloom filter, appends that
+// filter to `_bfs`, and resets the buffered values and the null flag.
+// Returns a non-OK status if the filter cannot be created or initialized.
+Status PrimaryKeyBloomFilterIndexWriterImpl::flush() {
+    std::unique_ptr<BloomFilter> filter;
+    RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &filter));
+    RETURN_IF_ERROR(filter->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
+    filter->set_has_null(_has_null);
+
+    // Feed the raw bytes of every buffered slice into the filter.
+    for (const Slice& value : _values) {
+        filter->add_bytes(value.data, value.size);
+    }
+
+    // Record the filter's size before ownership moves into `_bfs`.
+    _bf_buffer_size += filter->size();
+    _bfs.push_back(std::move(filter));
+
+    // Start the next batch from a clean state.
+    _has_null = false;
+    _values.clear();
+    return Status::OK();
+}
+
+// Finalizes the bloom filter index.
+//
+// Flushes any values still buffered, marks `index_meta` as a bloom filter index
+// (hash strategy from `_bf_options`, algorithm BLOCK_BLOOM_FILTER), and writes
+// every filter in `_bfs` through an IndexedColumnWriter bound to `file_writer`.
+// The column writer's metadata is stored in the `bloom_filter` field of the
+// index meta.
+//
+// Returns the first non-OK status encountered, Status::OK() otherwise.
+Status PrimaryKeyBloomFilterIndexWriterImpl::finish(io::FileWriter* file_writer,
+                                                    ColumnIndexMetaPB* index_meta) {
+    if (!_values.empty()) {
+        RETURN_IF_ERROR(flush());
+    }
+    index_meta->set_type(BLOOM_FILTER_INDEX);
+    BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index();
+    meta->set_hash_strategy(_bf_options.strategy);
+    meta->set_algorithm(BLOCK_BLOOM_FILTER);
+
+    // write bloom filters
+    const auto* bf_type_info = get_scalar_type_info<FieldType::OLAP_FIELD_TYPE_VARCHAR>();
+    IndexedColumnWriterOptions options;
+    options.write_ordinal_index = true;
+    options.write_value_index = false;
+    options.encoding = PLAIN_ENCODING;
+    IndexedColumnWriter bf_writer(options, bf_type_info, file_writer);
+    RETURN_IF_ERROR(bf_writer.init());
+    for (auto& bf : _bfs) {
+        Slice data(bf->data(), bf->size());
+        // Propagate a failed add instead of dropping its status; otherwise an
+        // incomplete index could be reported as written successfully.
+        RETURN_IF_ERROR(bf_writer.add(&data));
+    }
+    RETURN_IF_ERROR(bf_writer.finish(meta->mutable_bloom_filter()));
+    return Status::OK();
+}
+
+// Reports the bytes attributed to this writer: the summed size of the bloom
+// filters built so far plus the bytes the pool reports as allocated.
+uint64_t PrimaryKeyBloomFilterIndexWriterImpl::size() {
+    return _bf_buffer_size + _pool.total_allocated_bytes();
+}
+
// TODO currently we don't support bloom filter index for tinyint/hll/float/double
Status BloomFilterIndexWriter::create(const BloomFilterOptions& bf_options,
const TypeInfo* type_info,
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h
index 8b9a945e1a..52df34ec68 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h
@@ -23,6 +23,10 @@
#include "common/status.h"
#include "gen_cpp/segment_v2.pb.h"
#include "gutil/macros.h"
+#include "olap/rowset/segment_v2/bloom_filter.h"
+#include "runtime/mem_pool.h"
+#include "util/slice.h"
+#include "vec/common/arena.h"
namespace doris {
@@ -58,5 +62,40 @@ private:
DISALLOW_COPY_AND_ASSIGN(BloomFilterIndexWriter);
};
+// Bloom filter index writer for the primary key index of unique-key tables
+// with merge-on-write. The data of each segment is already deduplicated, so
+// this writer does not need a `set` for deduplication the way
+// `BloomFilterIndexWriterImpl` does; a vector is used instead to speed things up.
+class PrimaryKeyBloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
+public:
+    explicit PrimaryKeyBloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options,
+                                                  const TypeInfo* type_info)
+            : _bf_options(bf_options), _type_info(type_info) {}
+
+    ~PrimaryKeyBloomFilterIndexWriterImpl() override = default;
+
+    // Buffers `count` values; see the definition in the .cpp file.
+    void add_values(const void* values, size_t count) override;
+
+    // Only records that a null was seen; `count` itself is not used.
+    void add_nulls(uint32_t count) override { _has_null = true; }
+
+    Status flush() override;
+
+    Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override;
+
+    uint64_t size() override;
+
+private:
+    BloomFilterOptions _bf_options;
+    const TypeInfo* _type_info;
+    // backing storage handed to deep_copy for the buffered values
+    MemPool _pool;
+    bool _has_null = false;
+    // running total of the sizes of the filters held in `_bfs`
+    uint64_t _bf_buffer_size = 0;
+    // distinct values
+    std::vector<Slice> _values;
+    std::vector<std::unique_ptr<BloomFilter>> _bfs;
+};
+
} // namespace segment_v2
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org