You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2023/06/29 02:34:36 UTC

[doris] branch master updated: [improvement](olap) filter the whole segment by dictionary (#21239)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 7f0e37069f [improvement](olap) filter the whole segment by dictionary (#21239)
7f0e37069f is described below

commit 7f0e37069f83b319a20e59ddb49be289b13eedc0
Author: Jerry Hu <mr...@gmail.com>
AuthorDate: Thu Jun 29 10:34:29 2023 +0800

    [improvement](olap) filter the whole segment by dictionary (#21239)
---
 be/src/olap/block_column_predicate.cpp             | 15 ++++++
 be/src/olap/block_column_predicate.h               |  9 ++++
 be/src/olap/column_predicate.h                     |  4 ++
 be/src/olap/comparison_predicate.h                 | 13 +++++
 be/src/olap/in_list_predicate.h                    | 11 +++++
 be/src/olap/olap_common.h                          |  1 +
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 57 +++++++++++++++-------
 be/src/olap/rowset/segment_v2/column_reader.h      | 10 +++-
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 20 ++++++++
 be/src/vec/exec/scan/new_olap_scan_node.cpp        |  1 +
 be/src/vec/exec/scan/new_olap_scan_node.h          |  1 +
 be/src/vec/exec/scan/new_olap_scanner.cpp          |  1 +
 12 files changed, 125 insertions(+), 18 deletions(-)

diff --git a/be/src/olap/block_column_predicate.cpp b/be/src/olap/block_column_predicate.cpp
index ff99038012..8cfb89363c 100644
--- a/be/src/olap/block_column_predicate.cpp
+++ b/be/src/olap/block_column_predicate.cpp
@@ -52,6 +52,11 @@ bool SingleColumnBlockPredicate::evaluate_and(const segment_v2::BloomFilter* bf)
     return _predicate->evaluate_and(bf);
 }
 
+bool SingleColumnBlockPredicate::evaluate_and(const StringRef* dict_words,
+                                              const size_t dict_num) const {
+    return _predicate->evaluate_and(dict_words, dict_num);
+}
+
 void SingleColumnBlockPredicate::evaluate_or(vectorized::MutableColumns& block, uint16_t* sel,
                                              uint16_t selected_size, bool* flags) const {
     auto column_id = _predicate->column_id();
@@ -158,6 +163,16 @@ bool AndBlockColumnPredicate::evaluate_and(const segment_v2::BloomFilter* bf) co
     return true;
 }
 
+bool AndBlockColumnPredicate::evaluate_and(const StringRef* dict_words,
+                                           const size_t dict_num) const {
+    for (auto* predicate : _block_column_predicate_vec) {
+        if (!predicate->evaluate_and(dict_words, dict_num)) {
+            return false;
+        }
+    }
+    return true;
+}
+
 void AndBlockColumnPredicate::evaluate_or(vectorized::MutableColumns& block, uint16_t* sel,
                                           uint16_t selected_size, bool* flags) const {
     if (num_of_column_predicate() == 1) {
diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h
index 467962e809..0069a62d29 100644
--- a/be/src/olap/block_column_predicate.h
+++ b/be/src/olap/block_column_predicate.h
@@ -81,6 +81,12 @@ public:
         LOG(FATAL) << "should not reach here";
         return true;
     }
+
+    virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const {
+        LOG(FATAL) << "should not reach here";
+        return true;
+    }
+
     virtual bool can_do_bloom_filter() const { return false; }
 
     //evaluate predicate on inverted
@@ -109,6 +115,7 @@ public:
                       bool* flags) const override;
     bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& statistic) const override;
     bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
+    bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
     void evaluate_or(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size,
                      bool* flags) const override;
 
@@ -179,6 +186,8 @@ public:
 
     bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
 
+    bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
+
     bool can_do_bloom_filter() const override {
         for (auto& pred : _block_column_predicate_vec) {
             if (!pred->can_do_bloom_filter()) {
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index 31dbea7e56..88f40c92c1 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -179,6 +179,10 @@ public:
 
     virtual bool evaluate_and(const BloomFilter* bf) const { return true; }
 
+    virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_count) const {
+        return true;
+    }
+
     virtual bool can_do_bloom_filter() const { return false; }
 
     // used to evaluate pre read column in lazy materialization
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
index 00de4632e7..6524fdfc7d 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -259,6 +259,19 @@ public:
         }
     }
 
+    bool evaluate_and(const StringRef* dict_words, const size_t count) const override {
+        if constexpr (std::is_same_v<T, StringRef>) {
+            for (size_t i = 0; i != count; ++i) {
+                if (_operator(dict_words[i], _value) ^ _opposite) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        return true;
+    }
+
     bool can_do_bloom_filter() const override { return PT == PredicateType::EQ; }
 
     void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size,
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 8af69efc66..5f0f99f7eb 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -346,6 +346,17 @@ public:
         }
     }
 
+    bool evaluate_and(const StringRef* dict_words, const size_t count) const override {
+        for (size_t i = 0; i != count; ++i) {
+            const auto found = _values->find(dict_words[i].data, dict_words[i].size) ^ _opposite;
+            if (found == (PT == PredicateType::IN_LIST)) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
     bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
         if (statistic.first->is_null() || statistic.second->is_null()) {
             return false;
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 1ebf306b41..86f59af14c 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -324,6 +324,7 @@ struct OlapReaderStatistics {
     int64_t rows_key_range_filtered = 0;
     int64_t rows_stats_filtered = 0;
     int64_t rows_bf_filtered = 0;
+    int64_t rows_dict_filtered = 0;
     // Including the number of rows filtered out according to the Delete information in the Tablet,
     // and the number of rows filtered for marked deleted rows under the unique key model.
     // This metric is mainly used to record the number of rows filtered by the delete condition in Segment V1,
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 093eb365b0..5c3e67e0a2 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1170,23 +1170,8 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
         auto dict_page_decoder = reinterpret_cast<BinaryDictPageDecoder*>(_page.data_decoder.get());
         if (dict_page_decoder->is_dict_encoding()) {
             if (_dict_decoder == nullptr) {
-                // read dictionary page
-                Slice dict_data;
-                PageFooterPB dict_footer;
-                _opts.type = INDEX_PAGE;
-                RETURN_IF_ERROR(_reader->read_page(_opts, _reader->get_dict_page_pointer(),
-                                                   &_dict_page_handle, &dict_data, &dict_footer,
-                                                   _compress_codec));
-                // ignore dict_footer.dict_page_footer().encoding() due to only
-                // PLAIN_ENCODING is supported for dict page right now
-                _dict_decoder = std::make_unique<
-                        BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>>(dict_data);
-                RETURN_IF_ERROR(_dict_decoder->init());
-
-                auto* pd_decoder = (BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>*)
-                                           _dict_decoder.get();
-                _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
-                pd_decoder->get_dict_word_info(_dict_word_info.get());
+                RETURN_IF_ERROR(_read_dict_data());
+                CHECK_NOTNULL(_dict_decoder);
             }
 
             dict_page_decoder->set_dict_decoder(_dict_decoder.get(), _dict_word_info.get());
@@ -1195,6 +1180,27 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
     return Status::OK();
 }
 
+Status FileColumnIterator::_read_dict_data() {
+    CHECK_EQ(_reader->encoding_info()->encoding(), DICT_ENCODING);
+    // read dictionary page
+    Slice dict_data;
+    PageFooterPB dict_footer;
+    _opts.type = INDEX_PAGE;
+    RETURN_IF_ERROR(_reader->read_page(_opts, _reader->get_dict_page_pointer(), &_dict_page_handle,
+                                       &dict_data, &dict_footer, _compress_codec));
+    // ignore dict_footer.dict_page_footer().encoding() due to only
+    // PLAIN_ENCODING is supported for dict page right now
+    _dict_decoder =
+            std::make_unique<BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>>(dict_data);
+    RETURN_IF_ERROR(_dict_decoder->init());
+
+    auto* pd_decoder =
+            (BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>*)_dict_decoder.get();
+    _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
+    pd_decoder->get_dict_word_info(_dict_word_info.get());
+    return Status::OK();
+}
+
 Status FileColumnIterator::get_row_ranges_by_zone_map(
         const AndBlockColumnPredicate* col_predicates,
         const std::vector<const ColumnPredicate*>* delete_predicates, RowRanges* row_ranges) {
@@ -1213,6 +1219,23 @@ Status FileColumnIterator::get_row_ranges_by_bloom_filter(
     return Status::OK();
 }
 
+Status FileColumnIterator::get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+                                                  RowRanges* row_ranges) {
+    if (!_is_all_dict_encoding) {
+        return Status::OK();
+    }
+
+    if (!_dict_decoder) {
+        RETURN_IF_ERROR(_read_dict_data());
+        CHECK_NOTNULL(_dict_decoder);
+    }
+
+    if (!col_predicates->evaluate_and(_dict_word_info.get(), _dict_decoder->count())) {
+        row_ranges->clear();
+    }
+    return Status::OK();
+}
+
 Status DefaultValueColumnIterator::init(const ColumnIteratorOptions& opts) {
     _opts = opts;
     // be consistent with segment v1
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index bee0bcfb91..6cb9794b3b 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -302,6 +302,11 @@ public:
         return Status::OK();
     }
 
+    virtual Status get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+                                          RowRanges* row_ranges) {
+        return Status::OK();
+    }
+
     virtual bool is_all_dict_encoding() const { return false; }
 
 protected:
@@ -342,6 +347,9 @@ public:
     Status get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates,
                                           RowRanges* row_ranges) override;
 
+    Status get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+                                  RowRanges* row_ranges) override;
+
     ParsedPage* get_current_page() { return &_page; }
 
     bool is_nullable() { return _reader->is_nullable(); }
@@ -352,8 +360,8 @@ private:
     void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) const;
     Status _load_next_page(bool* eos);
     Status _read_data_page(const OrdinalPageIndexIterator& iter);
+    Status _read_dict_data();
 
-private:
     ColumnReader* _reader;
 
     // iterator owned compress codec, should NOT be shared by threads, initialized in init()
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 66a591635e..eaa3102d0b 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -491,6 +491,26 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
     RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges,
                                    condition_row_ranges);
     _opts.stats->rows_stats_filtered += (pre_size - condition_row_ranges->count());
+
+    /// Low cardinality optimization is currently not very stable, so to prevent data corruption,
+    /// we are temporarily disabling its use in data compaction.
+    if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
+        RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
+        for (auto cid : cids) {
+            RowRanges tmp_row_ranges = RowRanges::create_single(num_rows());
+            DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
+            uint32_t unique_cid = _schema->unique_id(cid);
+            RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_dict(
+                    _opts.col_id_to_predicates.at(cid).get(), &tmp_row_ranges));
+            RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges, &dict_row_ranges);
+        }
+
+        pre_size = condition_row_ranges->count();
+        RowRanges::ranges_intersection(*condition_row_ranges, dict_row_ranges,
+                                       condition_row_ranges);
+        _opts.stats->rows_dict_filtered += (pre_size - condition_row_ranges->count());
+    }
+
     return Status::OK();
 }
 
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index 740a57e793..011bd3a3e4 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -144,6 +144,7 @@ Status NewOlapScanNode::_init_profile() {
 
     _stats_filtered_counter = ADD_COUNTER(_segment_profile, "RowsStatsFiltered", TUnit::UNIT);
     _bf_filtered_counter = ADD_COUNTER(_segment_profile, "RowsBloomFilterFiltered", TUnit::UNIT);
+    _dict_filtered_counter = ADD_COUNTER(_segment_profile, "RowsDictFiltered", TUnit::UNIT);
     _del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered", TUnit::UNIT);
     _conditions_filtered_counter =
             ADD_COUNTER(_segment_profile, "RowsConditionsFiltered", TUnit::UNIT);
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h b/be/src/vec/exec/scan/new_olap_scan_node.h
index 21e2540350..fac6997c9d 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.h
+++ b/be/src/vec/exec/scan/new_olap_scan_node.h
@@ -135,6 +135,7 @@ private:
 
     RuntimeProfile::Counter* _stats_filtered_counter = nullptr;
     RuntimeProfile::Counter* _bf_filtered_counter = nullptr;
+    RuntimeProfile::Counter* _dict_filtered_counter = nullptr;
     RuntimeProfile::Counter* _del_filtered_counter = nullptr;
     RuntimeProfile::Counter* _conditions_filtered_counter = nullptr;
     RuntimeProfile::Counter* _key_range_filtered_counter = nullptr;
diff --git a/be/src/vec/exec/scan/new_olap_scanner.cpp b/be/src/vec/exec/scan/new_olap_scanner.cpp
index ca56bc95bd..e6763bf94b 100644
--- a/be/src/vec/exec/scan/new_olap_scanner.cpp
+++ b/be/src/vec/exec/scan/new_olap_scanner.cpp
@@ -571,6 +571,7 @@ void NewOlapScanner::_update_counters_before_close() {
     }
 
     COUNTER_UPDATE(olap_parent->_stats_filtered_counter, stats.rows_stats_filtered);
+    COUNTER_UPDATE(olap_parent->_dict_filtered_counter, stats.rows_dict_filtered);
     COUNTER_UPDATE(olap_parent->_bf_filtered_counter, stats.rows_bf_filtered);
     COUNTER_UPDATE(olap_parent->_del_filtered_counter, stats.rows_del_filtered);
     COUNTER_UPDATE(olap_parent->_del_filtered_counter, stats.rows_del_by_bitmap);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org