You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/06/23 23:21:31 UTC

[doris] branch master updated: [improvement]Support vectorized predicates for dict columns (#10370)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 2e661ac63f [improvement]Support vectorized predicates for dict columns (#10370)
2e661ac63f is described below

commit 2e661ac63f77a227e6ee42d95535616d866f0b40
Author: Jerry Hu <mr...@gmail.com>
AuthorDate: Fri Jun 24 07:21:26 2022 +0800

    [improvement]Support vectorized predicates for dict columns (#10370)
---
 be/src/olap/comparison_predicate.cpp               | 69 +++++++++++----
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 19 ++++-
 be/src/olap/rowset/segment_v2/column_reader.h      | 16 ++++
 be/src/olap/rowset/segment_v2/parsed_page.h        |  8 ++
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 97 ++++++++++++----------
 be/src/olap/rowset/segment_v2/segment_iterator.h   |  2 +
 6 files changed, 148 insertions(+), 63 deletions(-)

diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp
index 91ef9f7156..3424bbb6c9 100644
--- a/be/src/olap/comparison_predicate.cpp
+++ b/be/src/olap/comparison_predicate.cpp
@@ -227,22 +227,20 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
 // todo(wb) for date type we use uint32_t to save it but using Predicate<uint24> to evaluate it.
 // This is done for compatibility with Row Version predicate.
 // We can use Predicate<uint32_t> for date after Row Version is removed.
-#define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP)                                           \
+#define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP, IS_RANGE)                                 \
     template <class T>                                                                           \
     void CLASS<T>::evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) const { \
         if (column.is_nullable()) {                                                              \
             auto* nullable_column =                                                              \
                     vectorized::check_and_get_column<vectorized::ColumnNullable>(column);        \
-            auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<T>&>(      \
-                                       nullable_column->get_nested_column())                     \
-                                       .get_data();                                              \
             auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>(      \
                                         *(nullable_column->get_null_map_column_ptr()))           \
                                         .get_data();                                             \
+            auto& nested_col = nullable_column->get_nested_column();                             \
             if constexpr (std::is_same_v<T, uint24_t>) {                                         \
                 auto& predicate_column =                                                         \
                         reinterpret_cast<const vectorized::PredicateColumnType<uint32_t>&>(      \
-                                nullable_column->get_nested_column());                           \
+                                nested_col);                                                     \
                 uint32_t int32_val = 0;                                                          \
                 char* int32_val_ptr = (char*)&int32_val;                                         \
                 memory_copy(int32_val_ptr, _value.get_data(), sizeof(uint24_t));                 \
@@ -251,8 +249,28 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
                     flags[i] = (data_array_uint32_t[i] OP int32_val) && (!null_bitmap[i]);       \
                 }                                                                                \
             } else {                                                                             \
-                for (uint16_t i = 0; i < size; i++) {                                            \
-                    flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]);                   \
+                if (nested_col.is_column_dictionary()) {                                         \
+                    if constexpr (IS_RANGE) column.convert_dict_codes_if_necessary();            \
+                    if constexpr (std::is_same_v<T, StringValue>) {                              \
+                        auto dict_col = vectorized::check_and_get_column<                        \
+                                vectorized::ColumnDictionary<vectorized::Int32>>(nested_col);    \
+                        auto dict_code =                                                         \
+                                IS_RANGE ? dict_col->find_code_by_bound(_value, 1 OP 0, 1 OP 1)  \
+                                         : dict_col->find_code(_value);                          \
+                        auto& data_array = dict_col->get_data();                                 \
+                        for (uint16_t i = 0; i < size; i++) {                                    \
+                            flags[i] = (data_array[i] OP dict_code) && (!null_bitmap[i]);        \
+                        }                                                                        \
+                    }                                                                            \
+                } else {                                                                         \
+                    auto& data_array =                                                           \
+                            reinterpret_cast<const vectorized::PredicateColumnType<T>&>(         \
+                                    nested_col)                                                  \
+                                    .get_data();                                                 \
+                                                                                                 \
+                    for (uint16_t i = 0; i < size; i++) {                                        \
+                        flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]);               \
+                    }                                                                            \
                 }                                                                                \
             }                                                                                    \
         } else {                                                                                 \
@@ -267,11 +285,26 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
                     flags[i] = data_array[i] OP int32_val;                                       \
                 }                                                                                \
             } else {                                                                             \
-                auto& predicate_column =                                                         \
-                        reinterpret_cast<vectorized::PredicateColumnType<T>&>(column);           \
-                auto& data_array = predicate_column.get_data();                                  \
-                for (uint16_t i = 0; i < size; i++) {                                            \
-                    flags[i] = data_array[i] OP _value;                                          \
+                if (column.is_column_dictionary()) {                                             \
+                    if constexpr (IS_RANGE) column.convert_dict_codes_if_necessary();            \
+                    if constexpr (std::is_same_v<T, StringValue>) {                              \
+                        auto& dict_col = reinterpret_cast<                                       \
+                                vectorized::ColumnDictionary<vectorized::Int32>&>(column);       \
+                        auto dict_code =                                                         \
+                                IS_RANGE ? dict_col.find_code_by_bound(_value, 1 OP 0, 1 OP 1)   \
+                                         : dict_col.find_code(_value);                           \
+                        auto& data_array = dict_col.get_data();                                  \
+                        for (uint16_t i = 0; i < size; i++) {                                    \
+                            flags[i] = data_array[i] OP dict_code;                               \
+                        }                                                                        \
+                    }                                                                            \
+                } else {                                                                         \
+                    auto& predicate_column =                                                     \
+                            reinterpret_cast<vectorized::PredicateColumnType<T>&>(column);       \
+                    auto& data_array = predicate_column.get_data();                              \
+                    for (uint16_t i = 0; i < size; i++) {                                        \
+                        flags[i] = data_array[i] OP _value;                                      \
+                    }                                                                            \
                 }                                                                                \
             }                                                                                    \
         }                                                                                        \
@@ -282,12 +315,12 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
         }                                                                                        \
     }
 
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(EqualPredicate, ==)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(NotEqualPredicate, !=)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessPredicate, <)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessEqualPredicate, <=)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterPredicate, >)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterEqualPredicate, >=)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(EqualPredicate, ==, false)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(NotEqualPredicate, !=, false)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessPredicate, <, true)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessEqualPredicate, <=, true)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterPredicate, >, true)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterEqualPredicate, >=, true)
 
 #define COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_BOOL(CLASS, OP, BOOL_NAME, BOOL_OP, SHORT_OP)    \
     template <class T>                                                                         \
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7e2ecae006..8b0c8bc649 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -93,7 +93,11 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ColumnMetaPB&
 
 ColumnReader::ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& meta,
                            uint64_t num_rows, FilePathDesc path_desc)
-        : _meta(meta), _opts(opts), _num_rows(num_rows), _path_desc(path_desc) {}
+        : _meta(meta),
+          _opts(opts),
+          _num_rows(num_rows),
+          _path_desc(path_desc),
+          _dict_encoding_type(UNKNOWN_DICT_ENCODING) {}
 
 ColumnReader::~ColumnReader() = default;
 
@@ -510,6 +514,19 @@ FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) {
 Status FileColumnIterator::init(const ColumnIteratorOptions& opts) {
     _opts = opts;
     RETURN_IF_ERROR(get_block_compression_codec(_reader->get_compression(), _compress_codec));
+    if (config::enable_low_cardinality_optimize && // probe dictionary coverage only when the optimization is on
+        _reader->encoding_info()->encoding() == DICT_ENCODING) {
+        auto dict_encoding_type = _reader->get_dict_encoding_type(); // result cached on the reader by an earlier init(), if any
+        if (dict_encoding_type == ColumnReader::UNKNOWN_DICT_ENCODING) {
+            RETURN_IF_ERROR(seek_to_ordinal(_reader->num_rows() - 1)); // propagate a failed seek; otherwise _page below would not be the last page and a wrong answer would be cached
+            _is_all_dict_encoding = _page.is_dict_encoding; // NOTE(review): presumably the writer only ever falls back from dict to plain encoding, so the last page decides for the whole column - confirm
+            _reader->set_dict_encoding_type(_is_all_dict_encoding
+                                                    ? ColumnReader::ALL_DICT_ENCODING
+                                                    : ColumnReader::PARTIAL_DICT_ENCODING);
+        } else {
+            _is_all_dict_encoding = dict_encoding_type == ColumnReader::ALL_DICT_ENCODING; // reuse the cached answer, no page read needed
+        }
+    }
     return Status::OK();
 }
 
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index a365679ee7..1312a0eb89 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -90,6 +90,8 @@ public:
                          uint64_t num_rows, const FilePathDesc& path_desc,
                          std::unique_ptr<ColumnReader>* reader);
 
+    enum DictEncodingType { UNKNOWN_DICT_ENCODING, PARTIAL_DICT_ENCODING, ALL_DICT_ENCODING };
+
     ~ColumnReader();
 
     // create a new column iterator. Client should delete returned iterator
@@ -134,6 +136,12 @@ public:
 
     CompressionTypePB get_compression() const { return _meta.compression(); }
 
+    uint64_t num_rows() { return _num_rows; }
+
+    void set_dict_encoding_type(DictEncodingType type) { _dict_encoding_type = type; }
+
+    DictEncodingType get_dict_encoding_type() { return _dict_encoding_type; }
+
 private:
     ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& meta, uint64_t num_rows,
                  FilePathDesc path_desc);
@@ -174,6 +182,8 @@ private:
     uint64_t _num_rows;
     FilePathDesc _path_desc;
 
+    DictEncodingType _dict_encoding_type;
+
     TypeInfoPtr _type_info =
             TypeInfoPtr(nullptr, nullptr); // initialized in init(), may changed by subclasses.
     const EncodingInfo* _encoding_info =
@@ -244,6 +254,8 @@ public:
         return Status::OK();
     }
 
+    virtual bool is_all_dict_encoding() const { return false; }
+
 protected:
     ColumnIteratorOptions _opts;
 };
@@ -281,6 +293,8 @@ public:
 
     bool is_nullable() { return _reader->is_nullable(); }
 
+    bool is_all_dict_encoding() const override { return _is_all_dict_encoding; }
+
 private:
     void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) const;
     Status _load_next_page(bool* eos);
@@ -310,6 +324,8 @@ private:
     // current value ordinal
     ordinal_t _current_ordinal = 0;
 
+    bool _is_all_dict_encoding = false;
+
     std::unique_ptr<StringRef[]> _dict_word_info;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h
index 1b05e5dbd3..c70ebd62a6 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -21,6 +21,7 @@
 
 #include "common/status.h"
 #include "gen_cpp/segment_v2.pb.h"
+#include "olap/rowset/segment_v2/binary_dict_page.h"
 #include "olap/rowset/segment_v2/common.h"
 #include "olap/rowset/segment_v2/encoding_info.h"
 #include "olap/rowset/segment_v2/options.h"
@@ -56,6 +57,11 @@ struct ParsedPage {
         RETURN_IF_ERROR(encoding->create_page_decoder(data_slice, opts, &page->data_decoder));
         RETURN_IF_ERROR(page->data_decoder->init());
 
+        if (encoding->encoding() == DICT_ENCODING) {
+            auto dict_decoder = static_cast<BinaryDictPageDecoder*>(page->data_decoder);
+            page->is_dict_encoding = dict_decoder->is_dict_encoding();
+        }
+
         page->first_ordinal = footer.first_ordinal();
         page->num_rows = footer.num_values();
 
@@ -93,6 +99,8 @@ struct ParsedPage {
     // this means next row we will read
     ordinal_t offset_in_page = 0;
 
+    bool is_dict_encoding = false;
+
     bool contains(ordinal_t ord) {
         return ord >= first_ordinal && ord < (first_ordinal + num_rows);
     }
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 19a04c5576..69efdefff4 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -658,27 +658,20 @@ void SegmentIterator::_vec_init_lazy_materialization() {
 
         for (auto predicate : _col_predicates) {
             auto cid = predicate->column_id();
-            FieldType type = _schema.column(cid)->type();
             _is_pred_column[cid] = true;
             pred_column_ids.insert(cid);
 
             // Step1: check pred using short eval or vec eval
-            if (type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR ||
-                type == OLAP_FIELD_TYPE_STRING || predicate->type() == PredicateType::BF ||
-                predicate->type() == PredicateType::IN_LIST ||
-                predicate->type() == PredicateType::NOT_IN_LIST ||
-                predicate->type() == PredicateType::IS_NULL ||
-                predicate->type() == PredicateType::IS_NOT_NULL ||
-                type == OLAP_FIELD_TYPE_DECIMAL) {
-                short_cir_pred_col_id_set.insert(cid);
-                _short_cir_eval_predicate.push_back(predicate);
-            } else {
+            if (_can_evaluated_by_vectorized(predicate)) {
                 vec_pred_col_id_set.insert(predicate->column_id());
                 if (_pre_eval_block_predicate == nullptr) {
                     _pre_eval_block_predicate.reset(new AndBlockColumnPredicate());
                 }
                 _pre_eval_block_predicate->add_column_predicate(
                         new SingleColumnBlockPredicate(predicate));
+            } else {
+                short_cir_pred_col_id_set.insert(cid);
+                _short_cir_eval_predicate.push_back(predicate);
             }
         }
 
@@ -774,6 +767,30 @@ void SegmentIterator::_vec_init_lazy_materialization() {
     }
 }
 
+bool SegmentIterator::_can_evaluated_by_vectorized(ColumnPredicate* predicate) { // true: predicate may take the vectorized evaluation path; false: caller must use the other path
+    auto cid = predicate->column_id();
+    FieldType field_type = _schema.column(cid)->type(); // storage type of the column the predicate targets
+    switch (predicate->type()) {
+    case PredicateType::EQ: // only these six comparison predicates are candidates
+    case PredicateType::NE:
+    case PredicateType::LE:
+    case PredicateType::LT:
+    case PredicateType::GE:
+    case PredicateType::GT: {
+        if (field_type == OLAP_FIELD_TYPE_VARCHAR || field_type == OLAP_FIELD_TYPE_CHAR ||
+            field_type == OLAP_FIELD_TYPE_STRING) {
+            return config::enable_low_cardinality_optimize && // string columns qualify only with the optimization enabled ...
+                   _column_iterators[cid]->is_all_dict_encoding(); // ... and when the column iterator reports all-dict encoding
+        } else if (field_type == OLAP_FIELD_TYPE_DECIMAL) {
+            return false; // DECIMAL comparisons are never vectorized here
+        }
+        return true; // every other field type is accepted for the six comparison predicates
+    }
+    default:
+        return false; // any other predicate type is rejected
+    }
+}
+
 void SegmentIterator::_vec_init_char_column_id() {
     for (size_t i = 0; i < _schema.num_column_ids(); i++) {
         auto cid = _schema.column_id(i);
@@ -881,44 +898,36 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_
     _pre_eval_block_predicate->evaluate_vec(_current_return_columns, selected_size, ret_flags);
 
     uint16_t new_size = 0;
-    size_t num_zeros = simd::count_zero_num(reinterpret_cast<int8_t*>(ret_flags), original_size);
-    if (0 == num_zeros) {
-        for (uint16_t i = 0; i < original_size; i++) {
-            sel_rowid_idx[i] = i;
-        }
-        new_size = original_size;
-    } else if (num_zeros == original_size) {
-        //no row pass, let new_size = 0
-    } else {
-        uint32_t sel_pos = 0;
-        const uint32_t sel_end = sel_pos + selected_size;
-        static constexpr size_t SIMD_BYTES = 32;
-        const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;
-
-        while (sel_pos < sel_end_simd) {
-            auto mask = simd::bytes32_mask_to_bits32_mask(ret_flags + sel_pos);
-            if (0 == mask) {
-                //pass
-            } else if (0xffffffff == mask) {
-                for (uint32_t i = 0; i < SIMD_BYTES; i++) {
-                    sel_rowid_idx[new_size++] = sel_pos + i;
-                }
-            } else {
-                while (mask) {
-                    const size_t bit_pos = __builtin_ctzll(mask);
-                    sel_rowid_idx[new_size++] = sel_pos + bit_pos;
-                    mask = mask & (mask - 1);
-                }
+
+    uint32_t sel_pos = 0;
+    const uint32_t sel_end = sel_pos + selected_size;
+    static constexpr size_t SIMD_BYTES = 32;
+    const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;
+
+    while (sel_pos < sel_end_simd) {
+        auto mask = simd::bytes32_mask_to_bits32_mask(ret_flags + sel_pos);
+        if (0 == mask) {
+            //pass
+        } else if (0xffffffff == mask) {
+            for (uint32_t i = 0; i < SIMD_BYTES; i++) {
+                sel_rowid_idx[new_size++] = sel_pos + i;
+            }
+        } else {
+            while (mask) {
+                const size_t bit_pos = __builtin_ctzll(mask);
+                sel_rowid_idx[new_size++] = sel_pos + bit_pos;
+                mask = mask & (mask - 1);
             }
-            sel_pos += SIMD_BYTES;
         }
+        sel_pos += SIMD_BYTES;
+    }
 
-        for (; sel_pos < sel_end; sel_pos++) {
-            if (ret_flags[sel_pos]) {
-                sel_rowid_idx[new_size++] = sel_pos;
-            }
+    for (; sel_pos < sel_end; sel_pos++) {
+        if (ret_flags[sel_pos]) {
+            sel_rowid_idx[new_size++] = sel_pos;
         }
     }
+
     _opts.stats->rows_vec_cond_filtered += original_size - new_size;
     return new_size;
 }
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 04c68699a0..fbf44c34da 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -119,6 +119,8 @@ private:
         return Status::OK();
     }
 
+    bool _can_evaluated_by_vectorized(ColumnPredicate* predicate);
+
 private:
     class BitmapRangeIterator;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org