You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zh...@apache.org on 2023/06/07 01:55:28 UTC

[doris] 04/04: [Enhancement](merge-on-write) optimize bloom filter for primary key index (#20182)

This is an automated email from the ASF dual-hosted git repository.

zhangchen pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 8ea3dfcd101ec60eee963a80a59180796ebfca1f
Author: Xin Liao <li...@126.com>
AuthorDate: Wed May 31 09:49:15 2023 +0800

    [Enhancement](merge-on-write) optimize bloom filter for primary key index (#20182)
---
 be/src/olap/primary_key_index.cpp                  |  8 ++-
 .../segment_v2/bloom_filter_index_writer.cpp       | 59 +++++++++++++++++++++-
 .../rowset/segment_v2/bloom_filter_index_writer.h  | 39 ++++++++++++++
 3 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp
index 7c2c5fe16a..6d5dc0edd2 100644
--- a/be/src/olap/primary_key_index.cpp
+++ b/be/src/olap/primary_key_index.cpp
@@ -19,6 +19,9 @@
 
 #include "common/config.h"
 #include "io/fs/file_reader.h"
+#include "olap/olap_common.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
+#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
 #include "olap/rowset/segment_v2/encoding_info.h"
 
 namespace doris {
@@ -36,8 +39,9 @@ Status PrimaryKeyIndexBuilder::init() {
             new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
     RETURN_IF_ERROR(_primary_key_index_builder->init());
 
-    return segment_v2::BloomFilterIndexWriter::create(segment_v2::BloomFilterOptions(), type_info,
-                                                      &_bloom_filter_index_builder);
+    _bloom_filter_index_builder.reset(new segment_v2::PrimaryKeyBloomFilterIndexWriterImpl(
+            segment_v2::BloomFilterOptions(), type_info));
+    return Status::OK();
 }
 
 Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
index 5542a6068d..c8f8cc8c50 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
@@ -25,7 +25,6 @@
 #include "olap/rowset/segment_v2/encoding_info.h"
 #include "olap/rowset/segment_v2/indexed_column_writer.h"
 #include "olap/types.h"
-#include "runtime/mem_pool.h"
 #include "util/faststring.h"
 #include "util/slice.h"
 
@@ -170,6 +169,64 @@ private:
 
 } // namespace
 
+void PrimaryKeyBloomFilterIndexWriterImpl::add_values(const void* values, size_t count) {
+    // Buffer `count` keys, read from `values` as an array of Slice, for the next flush().
+    const Slice* keys = static_cast<const Slice*>(values);
+    for (size_t i = 0; i < count; ++i) { // size_t index matches the unsigned `count`
+        Slice copied;
+        _type_info->deep_copy(&copied, &keys[i], &_pool); // deep_copy is handed _pool
+        _values.push_back(copied);
+    }
+}
+
+Status PrimaryKeyBloomFilterIndexWriterImpl::flush() {
+    // Turn every key buffered since the previous flush into one block bloom filter.
+    std::unique_ptr<BloomFilter> filter;
+    RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &filter));
+    RETURN_IF_ERROR(filter->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
+    filter->set_has_null(_has_null);
+    for (const Slice& key : _values) {
+        filter->add_bytes(key.data, key.size);
+    }
+    _bf_buffer_size += filter->size();
+    _bfs.push_back(std::move(filter));
+    _values.clear(); // the next batch starts from an empty buffer
+    _has_null = false;
+    return Status::OK();
+}
+
+Status PrimaryKeyBloomFilterIndexWriterImpl::finish(io::FileWriter* file_writer,
+                                                    ColumnIndexMetaPB* index_meta) {
+    // Flushes any buffered keys, fills `index_meta`, then writes all filters via `file_writer`.
+    if (!_values.empty()) {
+        RETURN_IF_ERROR(flush());
+    }
+    index_meta->set_type(BLOOM_FILTER_INDEX);
+    BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index();
+    meta->set_hash_strategy(_bf_options.strategy);
+    meta->set_algorithm(BLOCK_BLOOM_FILTER);
+
+    // Each bloom filter is written as one plain-encoded value of an indexed column.
+    const auto* bf_type_info = get_scalar_type_info<FieldType::OLAP_FIELD_TYPE_VARCHAR>();
+    IndexedColumnWriterOptions options;
+    options.write_ordinal_index = true;
+    options.write_value_index = false;
+    options.encoding = PLAIN_ENCODING;
+    IndexedColumnWriter bf_writer(options, bf_type_info, file_writer);
+    RETURN_IF_ERROR(bf_writer.init());
+    for (auto& bf : _bfs) {
+        Slice data(bf->data(), bf->size());
+        RETURN_IF_ERROR(bf_writer.add(&data)); // a failed add must not be reported as OK
+    }
+    return bf_writer.finish(meta->mutable_bloom_filter());
+}
+
+uint64_t PrimaryKeyBloomFilterIndexWriterImpl::size() {
+    // Sum of the sizes of the bloom filters built by flush() and of
+    // the bytes that the key pool reports as allocated.
+    return _bf_buffer_size + _pool.total_allocated_bytes();
+}
+
 // TODO currently we don't support bloom filter index for tinyint/hll/float/double
 Status BloomFilterIndexWriter::create(const BloomFilterOptions& bf_options,
                                       const TypeInfo* type_info,
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h
index 8b9a945e1a..52df34ec68 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h
@@ -23,6 +23,10 @@
 #include "common/status.h"
 #include "gen_cpp/segment_v2.pb.h"
 #include "gutil/macros.h"
+#include "olap/rowset/segment_v2/bloom_filter.h"
+#include "runtime/mem_pool.h"
+#include "util/slice.h"
+#include "vec/common/arena.h"
 
 namespace doris {
 
@@ -58,5 +62,40 @@ private:
     DISALLOW_COPY_AND_ASSIGN(BloomFilterIndexWriter);
 };
 
+// Bloom filter index writer for the primary key index of unique-key merge-on-write
+// tables. Keys within a segment are already deduplicated, so they are buffered in a
+// plain vector instead of the `set` that `BloomFilterIndexWriterImpl` uses.
+class PrimaryKeyBloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
+public:
+    explicit PrimaryKeyBloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options,
+                                                  const TypeInfo* type_info)
+            : _bf_options(bf_options), _type_info(type_info) {}
+
+    ~PrimaryKeyBloomFilterIndexWriterImpl() override = default;
+
+    // `values` is read as an array of `count` Slice keys.
+    void add_values(const void* values, size_t count) override;
+
+    // Only records that a null was seen; `count` is not used.
+    void add_nulls(uint32_t count) override { _has_null = true; }
+
+    Status flush() override;
+
+    Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override;
+
+    uint64_t size() override;
+
+private:
+    BloomFilterOptions _bf_options;
+    const TypeInfo* _type_info;
+    MemPool _pool;
+    bool _has_null = false;
+    uint64_t _bf_buffer_size = 0;
+    // keys buffered since the last flush(); stored as given, no deduplication here
+    std::vector<Slice> _values;
+    // one bloom filter is appended by each flush()
+    std::vector<std::unique_ptr<BloomFilter>> _bfs;
+};
+
 } // namespace segment_v2
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org