You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/06/23 23:21:31 UTC
[doris] branch master updated: [improvement]Support vectorized predicates for dict columns (#10370)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2e661ac63f [improvement]Support vectorized predicates for dict columns (#10370)
2e661ac63f is described below
commit 2e661ac63f77a227e6ee42d95535616d866f0b40
Author: Jerry Hu <mr...@gmail.com>
AuthorDate: Fri Jun 24 07:21:26 2022 +0800
[improvement]Support vectorized predicates for dict columns (#10370)
---
be/src/olap/comparison_predicate.cpp | 69 +++++++++++----
be/src/olap/rowset/segment_v2/column_reader.cpp | 19 ++++-
be/src/olap/rowset/segment_v2/column_reader.h | 16 ++++
be/src/olap/rowset/segment_v2/parsed_page.h | 8 ++
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 97 ++++++++++++----------
be/src/olap/rowset/segment_v2/segment_iterator.h | 2 +
6 files changed, 148 insertions(+), 63 deletions(-)
diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp
index 91ef9f7156..3424bbb6c9 100644
--- a/be/src/olap/comparison_predicate.cpp
+++ b/be/src/olap/comparison_predicate.cpp
@@ -227,22 +227,20 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
// todo(wb) for date type we use uint32_t to save it but using Predicate<uint24> to evaluate it.
// This is done for compatibility with Row Version predicate.
// We can use Predicate<uint32_t> for date after Row Version is removed.
-#define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP) \
+#define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP, IS_RANGE) \
template <class T> \
void CLASS<T>::evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) const { \
if (column.is_nullable()) { \
auto* nullable_column = \
vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \
- auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<T>&>( \
- nullable_column->get_nested_column()) \
- .get_data(); \
auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>( \
*(nullable_column->get_null_map_column_ptr())) \
.get_data(); \
+ auto& nested_col = nullable_column->get_nested_column(); \
if constexpr (std::is_same_v<T, uint24_t>) { \
auto& predicate_column = \
reinterpret_cast<const vectorized::PredicateColumnType<uint32_t>&>( \
- nullable_column->get_nested_column()); \
+ nested_col); \
uint32_t int32_val = 0; \
char* int32_val_ptr = (char*)&int32_val; \
memory_copy(int32_val_ptr, _value.get_data(), sizeof(uint24_t)); \
@@ -251,8 +249,28 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
flags[i] = (data_array_uint32_t[i] OP int32_val) && (!null_bitmap[i]); \
} \
} else { \
- for (uint16_t i = 0; i < size; i++) { \
- flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]); \
+ if (nested_col.is_column_dictionary()) { \
+ if constexpr (IS_RANGE) column.convert_dict_codes_if_necessary(); \
+ if constexpr (std::is_same_v<T, StringValue>) { \
+ auto dict_col = vectorized::check_and_get_column< \
+ vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); \
+ auto dict_code = \
+ IS_RANGE ? dict_col->find_code_by_bound(_value, 1 OP 0, 1 OP 1) \
+ : dict_col->find_code(_value); \
+ auto& data_array = dict_col->get_data(); \
+ for (uint16_t i = 0; i < size; i++) { \
+ flags[i] = (data_array[i] OP dict_code) && (!null_bitmap[i]); \
+ } \
+ } \
+ } else { \
+ auto& data_array = \
+ reinterpret_cast<const vectorized::PredicateColumnType<T>&>( \
+ nested_col) \
+ .get_data(); \
+ \
+ for (uint16_t i = 0; i < size; i++) { \
+ flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]); \
+ } \
} \
} \
} else { \
@@ -267,11 +285,26 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
flags[i] = data_array[i] OP int32_val; \
} \
} else { \
- auto& predicate_column = \
- reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \
- auto& data_array = predicate_column.get_data(); \
- for (uint16_t i = 0; i < size; i++) { \
- flags[i] = data_array[i] OP _value; \
+ if (column.is_column_dictionary()) { \
+ if constexpr (IS_RANGE) column.convert_dict_codes_if_necessary(); \
+ if constexpr (std::is_same_v<T, StringValue>) { \
+ auto& dict_col = reinterpret_cast< \
+ vectorized::ColumnDictionary<vectorized::Int32>&>(column); \
+ auto dict_code = \
+ IS_RANGE ? dict_col.find_code_by_bound(_value, 1 OP 0, 1 OP 1) \
+ : dict_col.find_code(_value); \
+ auto& data_array = dict_col.get_data(); \
+ for (uint16_t i = 0; i < size; i++) { \
+ flags[i] = data_array[i] OP dict_code; \
+ } \
+ } \
+ } else { \
+ auto& predicate_column = \
+ reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \
+ auto& data_array = predicate_column.get_data(); \
+ for (uint16_t i = 0; i < size; i++) { \
+ flags[i] = data_array[i] OP _value; \
+ } \
} \
} \
} \
@@ -282,12 +315,12 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
} \
}
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(EqualPredicate, ==)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(NotEqualPredicate, !=)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessPredicate, <)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessEqualPredicate, <=)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterPredicate, >)
-COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterEqualPredicate, >=)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(EqualPredicate, ==, false)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(NotEqualPredicate, !=, false)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessPredicate, <, true)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(LessEqualPredicate, <=, true)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterPredicate, >, true)
+COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterEqualPredicate, >=, true)
#define COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_BOOL(CLASS, OP, BOOL_NAME, BOOL_OP, SHORT_OP) \
template <class T> \
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7e2ecae006..8b0c8bc649 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -93,7 +93,11 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ColumnMetaPB&
ColumnReader::ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& meta,
uint64_t num_rows, FilePathDesc path_desc)
- : _meta(meta), _opts(opts), _num_rows(num_rows), _path_desc(path_desc) {}
+ : _meta(meta),
+ _opts(opts),
+ _num_rows(num_rows),
+ _path_desc(path_desc),
+ _dict_encoding_type(UNKNOWN_DICT_ENCODING) {} // dictionary-encoding type starts out as unknown
ColumnReader::~ColumnReader() = default;
@@ -510,6 +514,19 @@ FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) {
Status FileColumnIterator::init(const ColumnIteratorOptions& opts) {
_opts = opts;
RETURN_IF_ERROR(get_block_compression_codec(_reader->get_compression(), _compress_codec));
+ // When low-cardinality optimization is enabled and the column uses DICT_ENCODING, decide
+ // whether the column counts as fully dictionary-encoded. The result is stored on the reader,
+ // so later iterators created on the same reader take the else-branch and skip the probe.
+ if (config::enable_low_cardinality_optimize &&
+ _reader->encoding_info()->encoding() == DICT_ENCODING) {
+ auto dict_encoding_type = _reader->get_dict_encoding_type();
+ // Do not probe an empty column: num_rows() - 1 would wrap around to the largest ordinal.
+ // In that case the type stays unknown and the else-branch sets the flag to false.
+ if (dict_encoding_type == ColumnReader::UNKNOWN_DICT_ENCODING && _reader->num_rows() > 0) {
+ // Probe the page that holds the last row. A failed seek is returned to the caller
+ // rather than ignored, so _page.is_dict_encoding is only read after a successful seek.
+ // NOTE(review): this treats "last page is dict-encoded" as "all pages are" - confirm
+ // against the dictionary page writer's fallback behaviour.
+ RETURN_IF_ERROR(seek_to_ordinal(_reader->num_rows() - 1));
+ _is_all_dict_encoding = _page.is_dict_encoding;
+ _reader->set_dict_encoding_type(_is_all_dict_encoding
+ ? ColumnReader::ALL_DICT_ENCODING
+ : ColumnReader::PARTIAL_DICT_ENCODING);
+ } else {
+ _is_all_dict_encoding = dict_encoding_type == ColumnReader::ALL_DICT_ENCODING;
+ }
+ }
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index a365679ee7..1312a0eb89 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -90,6 +90,8 @@ public:
uint64_t num_rows, const FilePathDesc& path_desc,
std::unique_ptr<ColumnReader>* reader);
+ enum DictEncodingType { UNKNOWN_DICT_ENCODING, PARTIAL_DICT_ENCODING, ALL_DICT_ENCODING }; // UNKNOWN: not determined yet; ALL / PARTIAL: outcome of the dictionary-encoding probe
+
~ColumnReader();
// create a new column iterator. Client should delete returned iterator
@@ -134,6 +136,12 @@ public:
CompressionTypePB get_compression() const { return _meta.compression(); }
+ // Row count this reader was constructed with. Marked const so it can be called through a
+ // const ColumnReader.
+ uint64_t num_rows() const { return _num_rows; }
+
+ void set_dict_encoding_type(DictEncodingType type) { _dict_encoding_type = type; } // records the dictionary-encoding type on this reader
+
+ // Returns the dictionary-encoding type currently recorded on this reader. Marked const so
+ // it can be called through a const ColumnReader.
+ DictEncodingType get_dict_encoding_type() const { return _dict_encoding_type; }
+
private:
ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& meta, uint64_t num_rows,
FilePathDesc path_desc);
@@ -174,6 +182,8 @@ private:
uint64_t _num_rows;
FilePathDesc _path_desc;
+ DictEncodingType _dict_encoding_type;
+
TypeInfoPtr _type_info =
TypeInfoPtr(nullptr, nullptr); // initialized in init(), may changed by subclasses.
const EncodingInfo* _encoding_info =
@@ -244,6 +254,8 @@ public:
return Status::OK();
}
+ virtual bool is_all_dict_encoding() const { return false; } // default answer is false; subclasses that track dictionary encoding override this
+
protected:
ColumnIteratorOptions _opts;
};
@@ -281,6 +293,8 @@ public:
bool is_nullable() { return _reader->is_nullable(); }
+ bool is_all_dict_encoding() const override { return _is_all_dict_encoding; } // returns the stored flag (member default: false)
+
private:
void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) const;
Status _load_next_page(bool* eos);
@@ -310,6 +324,8 @@ private:
// current value ordinal
ordinal_t _current_ordinal = 0;
+ bool _is_all_dict_encoding = false;
+
std::unique_ptr<StringRef[]> _dict_word_info;
};
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h
index 1b05e5dbd3..c70ebd62a6 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -21,6 +21,7 @@
#include "common/status.h"
#include "gen_cpp/segment_v2.pb.h"
+#include "olap/rowset/segment_v2/binary_dict_page.h"
#include "olap/rowset/segment_v2/common.h"
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/rowset/segment_v2/options.h"
@@ -56,6 +57,11 @@ struct ParsedPage {
RETURN_IF_ERROR(encoding->create_page_decoder(data_slice, opts, &page->data_decoder));
RETURN_IF_ERROR(page->data_decoder->init());
+ if (encoding->encoding() == DICT_ENCODING) {
+ auto dict_decoder = static_cast<BinaryDictPageDecoder*>(page->data_decoder);
+ page->is_dict_encoding = dict_decoder->is_dict_encoding();
+ }
+
page->first_ordinal = footer.first_ordinal();
page->num_rows = footer.num_values();
@@ -93,6 +99,8 @@ struct ParsedPage {
// this means next row we will read
ordinal_t offset_in_page = 0;
+ bool is_dict_encoding = false;
+
bool contains(ordinal_t ord) {
return ord >= first_ordinal && ord < (first_ordinal + num_rows);
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 19a04c5576..69efdefff4 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -658,27 +658,20 @@ void SegmentIterator::_vec_init_lazy_materialization() {
for (auto predicate : _col_predicates) {
auto cid = predicate->column_id();
- FieldType type = _schema.column(cid)->type();
_is_pred_column[cid] = true;
pred_column_ids.insert(cid);
// Step1: check pred using short eval or vec eval
- if (type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR ||
- type == OLAP_FIELD_TYPE_STRING || predicate->type() == PredicateType::BF ||
- predicate->type() == PredicateType::IN_LIST ||
- predicate->type() == PredicateType::NOT_IN_LIST ||
- predicate->type() == PredicateType::IS_NULL ||
- predicate->type() == PredicateType::IS_NOT_NULL ||
- type == OLAP_FIELD_TYPE_DECIMAL) {
- short_cir_pred_col_id_set.insert(cid);
- _short_cir_eval_predicate.push_back(predicate);
- } else {
+ if (_can_evaluated_by_vectorized(predicate)) {
vec_pred_col_id_set.insert(predicate->column_id());
if (_pre_eval_block_predicate == nullptr) {
_pre_eval_block_predicate.reset(new AndBlockColumnPredicate());
}
_pre_eval_block_predicate->add_column_predicate(
new SingleColumnBlockPredicate(predicate));
+ } else {
+ short_cir_pred_col_id_set.insert(cid);
+ _short_cir_eval_predicate.push_back(predicate);
}
}
@@ -774,6 +767,30 @@ void SegmentIterator::_vec_init_lazy_materialization() {
}
}
+bool SegmentIterator::_can_evaluated_by_vectorized(ColumnPredicate* predicate) {
+ // Returns true when `predicate` can be evaluated by the vectorized path, false when it has
+ // to be left to short-circuit evaluation.
+ const auto column_id = predicate->column_id();
+ const FieldType column_type = _schema.column(column_id)->type();
+ const auto pred_type = predicate->type();
+
+ // Only the six plain comparison operators are candidates; every other predicate type is
+ // rejected up front.
+ const bool is_comparison =
+ pred_type == PredicateType::EQ || pred_type == PredicateType::NE ||
+ pred_type == PredicateType::LE || pred_type == PredicateType::LT ||
+ pred_type == PredicateType::GE || pred_type == PredicateType::GT;
+ if (!is_comparison) {
+ return false;
+ }
+
+ // String-like columns qualify only when low-cardinality optimization is enabled and the
+ // column iterator reports that the column is entirely dictionary-encoded.
+ const bool is_string_like = column_type == OLAP_FIELD_TYPE_VARCHAR ||
+ column_type == OLAP_FIELD_TYPE_CHAR ||
+ column_type == OLAP_FIELD_TYPE_STRING;
+ if (is_string_like) {
+ return config::enable_low_cardinality_optimize &&
+ _column_iterators[column_id]->is_all_dict_encoding();
+ }
+
+ // DECIMAL comparisons are never vectorized; all remaining field types are.
+ return column_type != OLAP_FIELD_TYPE_DECIMAL;
+}
+
void SegmentIterator::_vec_init_char_column_id() {
for (size_t i = 0; i < _schema.num_column_ids(); i++) {
auto cid = _schema.column_id(i);
@@ -881,44 +898,36 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_
_pre_eval_block_predicate->evaluate_vec(_current_return_columns, selected_size, ret_flags);
uint16_t new_size = 0;
- size_t num_zeros = simd::count_zero_num(reinterpret_cast<int8_t*>(ret_flags), original_size);
- if (0 == num_zeros) {
- for (uint16_t i = 0; i < original_size; i++) {
- sel_rowid_idx[i] = i;
- }
- new_size = original_size;
- } else if (num_zeros == original_size) {
- //no row pass, let new_size = 0
- } else {
- uint32_t sel_pos = 0;
- const uint32_t sel_end = sel_pos + selected_size;
- static constexpr size_t SIMD_BYTES = 32;
- const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;
-
- while (sel_pos < sel_end_simd) {
- auto mask = simd::bytes32_mask_to_bits32_mask(ret_flags + sel_pos);
- if (0 == mask) {
- //pass
- } else if (0xffffffff == mask) {
- for (uint32_t i = 0; i < SIMD_BYTES; i++) {
- sel_rowid_idx[new_size++] = sel_pos + i;
- }
- } else {
- while (mask) {
- const size_t bit_pos = __builtin_ctzll(mask);
- sel_rowid_idx[new_size++] = sel_pos + bit_pos;
- mask = mask & (mask - 1);
- }
+
+ uint32_t sel_pos = 0;
+ const uint32_t sel_end = sel_pos + selected_size;
+ static constexpr size_t SIMD_BYTES = 32;
+ const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;
+
+ while (sel_pos < sel_end_simd) {
+ auto mask = simd::bytes32_mask_to_bits32_mask(ret_flags + sel_pos);
+ if (0 == mask) {
+ //pass
+ } else if (0xffffffff == mask) {
+ for (uint32_t i = 0; i < SIMD_BYTES; i++) {
+ sel_rowid_idx[new_size++] = sel_pos + i;
+ }
+ } else {
+ while (mask) {
+ const size_t bit_pos = __builtin_ctzll(mask);
+ sel_rowid_idx[new_size++] = sel_pos + bit_pos;
+ mask = mask & (mask - 1);
}
- sel_pos += SIMD_BYTES;
}
+ sel_pos += SIMD_BYTES;
+ }
- for (; sel_pos < sel_end; sel_pos++) {
- if (ret_flags[sel_pos]) {
- sel_rowid_idx[new_size++] = sel_pos;
- }
+ for (; sel_pos < sel_end; sel_pos++) {
+ if (ret_flags[sel_pos]) {
+ sel_rowid_idx[new_size++] = sel_pos;
}
}
+
_opts.stats->rows_vec_cond_filtered += original_size - new_size;
return new_size;
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 04c68699a0..fbf44c34da 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -119,6 +119,8 @@ private:
return Status::OK();
}
+ bool _can_evaluated_by_vectorized(ColumnPredicate* predicate);
+
private:
class BitmapRangeIterator;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org