You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2023/06/29 02:34:36 UTC
[doris] branch master updated: [improvement](olap) filter the whole segment by dictionary (#21239)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7f0e37069f [improvement](olap) filter the whole segment by dictionary (#21239)
7f0e37069f is described below
commit 7f0e37069f83b319a20e59ddb49be289b13eedc0
Author: Jerry Hu <mr...@gmail.com>
AuthorDate: Thu Jun 29 10:34:29 2023 +0800
[improvement](olap) filter the whole segment by dictionary (#21239)
---
be/src/olap/block_column_predicate.cpp | 15 ++++++
be/src/olap/block_column_predicate.h | 9 ++++
be/src/olap/column_predicate.h | 4 ++
be/src/olap/comparison_predicate.h | 13 +++++
be/src/olap/in_list_predicate.h | 11 +++++
be/src/olap/olap_common.h | 1 +
be/src/olap/rowset/segment_v2/column_reader.cpp | 57 +++++++++++++++-------
be/src/olap/rowset/segment_v2/column_reader.h | 10 +++-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 20 ++++++++
be/src/vec/exec/scan/new_olap_scan_node.cpp | 1 +
be/src/vec/exec/scan/new_olap_scan_node.h | 1 +
be/src/vec/exec/scan/new_olap_scanner.cpp | 1 +
12 files changed, 125 insertions(+), 18 deletions(-)
diff --git a/be/src/olap/block_column_predicate.cpp b/be/src/olap/block_column_predicate.cpp
index ff99038012..8cfb89363c 100644
--- a/be/src/olap/block_column_predicate.cpp
+++ b/be/src/olap/block_column_predicate.cpp
@@ -52,6 +52,11 @@ bool SingleColumnBlockPredicate::evaluate_and(const segment_v2::BloomFilter* bf)
return _predicate->evaluate_and(bf);
}
+bool SingleColumnBlockPredicate::evaluate_and(const StringRef* dict_words,
+ const size_t dict_num) const { // dictionary overload: takes a StringRef array and its element count
+ return _predicate->evaluate_and(dict_words, dict_num); // forwards unchanged to the wrapped column predicate
+}
+
void SingleColumnBlockPredicate::evaluate_or(vectorized::MutableColumns& block, uint16_t* sel,
uint16_t selected_size, bool* flags) const {
auto column_id = _predicate->column_id();
@@ -158,6 +163,16 @@ bool AndBlockColumnPredicate::evaluate_and(const segment_v2::BloomFilter* bf) co
return true;
}
+bool AndBlockColumnPredicate::evaluate_and(const StringRef* dict_words,
+ const size_t dict_num) const { // AND over all child predicates, evaluated against the same dictionary
+ for (auto* predicate : _block_column_predicate_vec) {
+ if (!predicate->evaluate_and(dict_words, dict_num)) {
+ return false; // short-circuit: the first child that rejects the dictionary decides the result
+ }
+ }
+ return true; // no child rejected it (also the result when the child list is empty)
+}
+
void AndBlockColumnPredicate::evaluate_or(vectorized::MutableColumns& block, uint16_t* sel,
uint16_t selected_size, bool* flags) const {
if (num_of_column_predicate() == 1) {
diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h
index 467962e809..0069a62d29 100644
--- a/be/src/olap/block_column_predicate.h
+++ b/be/src/olap/block_column_predicate.h
@@ -81,6 +81,12 @@ public:
LOG(FATAL) << "should not reach here";
return true;
}
+
+ virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const { // base default for the dictionary overload
+ LOG(FATAL) << "should not reach here"; // this patch adds overrides in SingleColumnBlockPredicate and AndBlockColumnPredicate
+ return true; // NOTE(review): presumably never reached if LOG(FATAL) aborts — confirm
+ }
+
virtual bool can_do_bloom_filter() const { return false; }
//evaluate predicate on inverted
@@ -109,6 +115,7 @@ public:
bool* flags) const override;
bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& statistic) const override;
bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
+ bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
void evaluate_or(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size,
bool* flags) const override;
@@ -179,6 +186,8 @@ public:
bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
+ bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
+
bool can_do_bloom_filter() const override {
for (auto& pred : _block_column_predicate_vec) {
if (!pred->can_do_bloom_filter()) {
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index 31dbea7e56..88f40c92c1 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -179,6 +179,10 @@ public:
virtual bool evaluate_and(const BloomFilter* bf) const { return true; }
+ virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_count) const { // base default for the dictionary overload
+ return true; // both arguments are ignored; predicates without an override never reject a dictionary
+ }
+
virtual bool can_do_bloom_filter() const { return false; }
// used to evaluate pre read column in lazy materialization
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
index 00de4632e7..6524fdfc7d 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -259,6 +259,19 @@ public:
}
}
+ bool evaluate_and(const StringRef* dict_words, const size_t count) const override { // true if at least one of the `count` words can satisfy this comparison
+ if constexpr (std::is_same_v<T, StringRef>) { // the dictionary scan is compiled in only when T is StringRef
+ for (size_t i = 0; i != count; ++i) {
+ if (_operator(dict_words[i], _value) ^ _opposite) { // comparison result, flipped when _opposite is set
+ return true; // one matching word is enough
+ }
+ }
+ return false; // no word matched (including count == 0)
+ }
+
+ return true; // for any other T the dictionary is never rejected
+ }
+
bool can_do_bloom_filter() const override { return PT == PredicateType::EQ; }
void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size,
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 8af69efc66..5f0f99f7eb 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -346,6 +346,17 @@ public:
}
}
+ bool evaluate_and(const StringRef* dict_words, const size_t count) const override { // true if at least one of the `count` words can satisfy this list predicate
+ for (size_t i = 0; i != count; ++i) {
+ const auto found = _values->find(dict_words[i].data, dict_words[i].size) ^ _opposite; // membership in _values, flipped when _opposite is set
+ if (found == (PT == PredicateType::IN_LIST)) { // IN_LIST needs found to be true; any other PT needs it false
+ return true; // one satisfying word is enough
+ }
+ }
+
+ return false; // no word satisfied the predicate (including count == 0)
+ }
+
bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
if (statistic.first->is_null() || statistic.second->is_null()) {
return false;
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 1ebf306b41..86f59af14c 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -324,6 +324,7 @@ struct OlapReaderStatistics {
int64_t rows_key_range_filtered = 0;
int64_t rows_stats_filtered = 0;
int64_t rows_bf_filtered = 0;
+ int64_t rows_dict_filtered = 0;
// Including the number of rows filtered out according to the Delete information in the Tablet,
// and the number of rows filtered for marked deleted rows under the unique key model.
// This metric is mainly used to record the number of rows filtered by the delete condition in Segment V1,
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 093eb365b0..5c3e67e0a2 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1170,23 +1170,8 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
auto dict_page_decoder = reinterpret_cast<BinaryDictPageDecoder*>(_page.data_decoder.get());
if (dict_page_decoder->is_dict_encoding()) {
if (_dict_decoder == nullptr) {
- // read dictionary page
- Slice dict_data;
- PageFooterPB dict_footer;
- _opts.type = INDEX_PAGE;
- RETURN_IF_ERROR(_reader->read_page(_opts, _reader->get_dict_page_pointer(),
- &_dict_page_handle, &dict_data, &dict_footer,
- _compress_codec));
- // ignore dict_footer.dict_page_footer().encoding() due to only
- // PLAIN_ENCODING is supported for dict page right now
- _dict_decoder = std::make_unique<
- BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>>(dict_data);
- RETURN_IF_ERROR(_dict_decoder->init());
-
- auto* pd_decoder = (BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>*)
- _dict_decoder.get();
- _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
- pd_decoder->get_dict_word_info(_dict_word_info.get());
+ RETURN_IF_ERROR(_read_dict_data());
+ CHECK_NOTNULL(_dict_decoder);
}
dict_page_decoder->set_dict_decoder(_dict_decoder.get(), _dict_word_info.get());
@@ -1195,6 +1180,27 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
return Status::OK();
}
+Status FileColumnIterator::_read_dict_data() { // reads the dictionary page and sets _dict_decoder and _dict_word_info
+ CHECK_EQ(_reader->encoding_info()->encoding(), DICT_ENCODING); // asserts the column reader reports DICT_ENCODING
+ // read dictionary page
+ Slice dict_data;
+ PageFooterPB dict_footer;
+ _opts.type = INDEX_PAGE; // NOTE(review): mutates the member _opts; not restored within this function
+ RETURN_IF_ERROR(_reader->read_page(_opts, _reader->get_dict_page_pointer(), &_dict_page_handle,
+ &dict_data, &dict_footer, _compress_codec));
+ // ignore dict_footer.dict_page_footer().encoding() due to only
+ // PLAIN_ENCODING is supported for dict page right now
+ _dict_decoder =
+ std::make_unique<BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>>(dict_data);
+ RETURN_IF_ERROR(_dict_decoder->init()); // any non-OK status from read_page or init is returned to the caller
+
+ auto* pd_decoder =
+ (BinaryPlainPageDecoder<FieldType::OLAP_FIELD_TYPE_VARCHAR>*)_dict_decoder.get(); // same concrete type as created by make_unique above
+ _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]); // array sized to the decoder's _num_elems
+ pd_decoder->get_dict_word_info(_dict_word_info.get()); // presumably fills the array with one entry per dictionary word — confirm
+ return Status::OK();
+}
+
Status FileColumnIterator::get_row_ranges_by_zone_map(
const AndBlockColumnPredicate* col_predicates,
const std::vector<const ColumnPredicate*>* delete_predicates, RowRanges* row_ranges) {
@@ -1213,6 +1219,23 @@ Status FileColumnIterator::get_row_ranges_by_bloom_filter(
return Status::OK();
}
+Status FileColumnIterator::get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+ RowRanges* row_ranges) { // clears *row_ranges when the dictionary cannot satisfy col_predicates
+ if (!_is_all_dict_encoding) { // flag not set: leave row_ranges untouched
+ return Status::OK();
+ }
+
+ if (!_dict_decoder) { // dictionary not loaded yet: read it on demand
+ RETURN_IF_ERROR(_read_dict_data());
+ CHECK_NOTNULL(_dict_decoder);
+ }
+
+ if (!col_predicates->evaluate_and(_dict_word_info.get(), _dict_decoder->count())) { // predicates are checked against the dictionary words
+ row_ranges->clear(); // predicates rejected the dictionary, so every candidate range is dropped
+ }
+ return Status::OK(); // otherwise row_ranges is left as passed in
+}
+
Status DefaultValueColumnIterator::init(const ColumnIteratorOptions& opts) {
_opts = opts;
// be consistent with segment v1
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index bee0bcfb91..6cb9794b3b 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -302,6 +302,11 @@ public:
return Status::OK();
}
+ virtual Status get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates, // base default: no dictionary-based pruning
+ RowRanges* row_ranges) {
+ return Status::OK(); // row_ranges is left untouched; this patch overrides it in FileColumnIterator
+ }
+
virtual bool is_all_dict_encoding() const { return false; }
protected:
@@ -342,6 +347,9 @@ public:
Status get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates,
RowRanges* row_ranges) override;
+ Status get_row_ranges_by_dict(const AndBlockColumnPredicate* col_predicates,
+ RowRanges* row_ranges) override;
+
ParsedPage* get_current_page() { return &_page; }
bool is_nullable() { return _reader->is_nullable(); }
@@ -352,8 +360,8 @@ private:
void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) const;
Status _load_next_page(bool* eos);
Status _read_data_page(const OrdinalPageIndexIterator& iter);
+ Status _read_dict_data();
-private:
ColumnReader* _reader;
// iterator owned compress codec, should NOT be shared by threads, initialized in init()
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 66a591635e..eaa3102d0b 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -491,6 +491,26 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges,
condition_row_ranges);
_opts.stats->rows_stats_filtered += (pre_size - condition_row_ranges->count());
+
+ /// Low cardinality optimization is currently not very stable, so to prevent data corruption,
+ /// we are temporarily disabling its use in data compaction.
+ if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
+ RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
+ for (auto cid : cids) {
+ RowRanges tmp_row_ranges = RowRanges::create_single(num_rows());
+ DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
+ uint32_t unique_cid = _schema->unique_id(cid);
+ RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_dict(
+ _opts.col_id_to_predicates.at(cid).get(), &tmp_row_ranges));
+ RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges, &dict_row_ranges);
+ }
+
+ pre_size = condition_row_ranges->count();
+ RowRanges::ranges_intersection(*condition_row_ranges, dict_row_ranges,
+ condition_row_ranges);
+ _opts.stats->rows_dict_filtered += (pre_size - condition_row_ranges->count());
+ }
+
return Status::OK();
}
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index 740a57e793..011bd3a3e4 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -144,6 +144,7 @@ Status NewOlapScanNode::_init_profile() {
_stats_filtered_counter = ADD_COUNTER(_segment_profile, "RowsStatsFiltered", TUnit::UNIT);
_bf_filtered_counter = ADD_COUNTER(_segment_profile, "RowsBloomFilterFiltered", TUnit::UNIT);
+ _dict_filtered_counter = ADD_COUNTER(_segment_profile, "RowsDictFiltered", TUnit::UNIT);
_del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered", TUnit::UNIT);
_conditions_filtered_counter =
ADD_COUNTER(_segment_profile, "RowsConditionsFiltered", TUnit::UNIT);
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h b/be/src/vec/exec/scan/new_olap_scan_node.h
index 21e2540350..fac6997c9d 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.h
+++ b/be/src/vec/exec/scan/new_olap_scan_node.h
@@ -135,6 +135,7 @@ private:
RuntimeProfile::Counter* _stats_filtered_counter = nullptr;
RuntimeProfile::Counter* _bf_filtered_counter = nullptr;
+ RuntimeProfile::Counter* _dict_filtered_counter = nullptr;
RuntimeProfile::Counter* _del_filtered_counter = nullptr;
RuntimeProfile::Counter* _conditions_filtered_counter = nullptr;
RuntimeProfile::Counter* _key_range_filtered_counter = nullptr;
diff --git a/be/src/vec/exec/scan/new_olap_scanner.cpp b/be/src/vec/exec/scan/new_olap_scanner.cpp
index ca56bc95bd..e6763bf94b 100644
--- a/be/src/vec/exec/scan/new_olap_scanner.cpp
+++ b/be/src/vec/exec/scan/new_olap_scanner.cpp
@@ -571,6 +571,7 @@ void NewOlapScanner::_update_counters_before_close() {
}
COUNTER_UPDATE(olap_parent->_stats_filtered_counter, stats.rows_stats_filtered);
+ COUNTER_UPDATE(olap_parent->_dict_filtered_counter, stats.rows_dict_filtered);
COUNTER_UPDATE(olap_parent->_bf_filtered_counter, stats.rows_bf_filtered);
COUNTER_UPDATE(olap_parent->_del_filtered_counter, stats.rows_del_filtered);
COUNTER_UPDATE(olap_parent->_del_filtered_counter, stats.rows_del_by_bitmap);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org