You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/12/16 14:53:24 UTC

[GitHub] [doris] xiaokang commented on a diff in pull request #13035: [improvement](index) Support bitmap index can be applied with compound predicate when enable vectorized engine query

xiaokang commented on code in PR #13035:
URL: https://github.com/apache/doris/pull/13035#discussion_r1033059767


##########
be/src/vec/exprs/vectorized_fn_call.cpp:
##########
@@ -104,12 +104,36 @@ doris::Status VectorizedFnCall::execute(VExprContext* context, doris::vectorized
     size_t num_columns_without_result = block->columns();
     // prepare a column to save result
     block->insert({nullptr, _data_type, _expr_name});
+    if (_function->can_fast_execute()) {
+        bool ok = fast_execute(context->fn_context(_fn_context_index), *block, arguments,
+                                       num_columns_without_result, block->rows());
+        if (ok) {
+            *result_column_id = num_columns_without_result;
+            return Status::OK();
+        }
+    }
+
     RETURN_IF_ERROR(_function->execute(context->fn_context(_fn_context_index), *block, arguments,
                                        num_columns_without_result, block->rows(), false));
     *result_column_id = num_columns_without_result;
     return Status::OK();
 }
 
+bool VectorizedFnCall::fast_execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments,

Review Comment:
   The name fast_execute is not very descriptive. It may be better to just do this inside execute.



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -822,6 +832,186 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+Status VScanNode::_normalize_compound_predicate(vectorized::VExpr* expr,
+                    VExprContext* expr_ctx, 
+                    PushDownType* pdt,
+                    std::vector<ColumnValueRangeType>* column_value_rangs,
+                    const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>& in_predicate_checker,
+                    const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>& eq_predicate_checker) {
+    if (TExprNodeType::COMPOUND_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+        auto compound_fn_name = expr->fn().name.function_name;
+        auto children_num = expr->children().size();
+        for (auto i = 0; i < children_num; ++i) {
+            VExpr* child_expr = expr->children()[i];
+            if (TExprNodeType::BINARY_PRED == child_expr->node_type()) {
+                SlotDescriptor* slot = nullptr;
+                ColumnValueRangeType* range_on_slot = nullptr;
+                if (_is_predicate_acting_on_slot(child_expr, in_predicate_checker, &slot, &range_on_slot) ||
+                        _is_predicate_acting_on_slot(child_expr, eq_predicate_checker, &slot, &range_on_slot)) {
+                    ColumnValueRangeType active_range = *range_on_slot; // copy, in order not to affect the range in the _colname_to_value_range
+                    std::visit(
+                        [&](auto& value_range) {
+                           _normalize_binary_in_compound_predicate(
+                                    child_expr, expr_ctx, slot, value_range, pdt,
+                                    _get_compound_type_by_fn_name(compound_fn_name));
+                        },
+                        active_range);
+                    
+                    column_value_rangs->emplace_back(active_range);
+                }
+            } else if (TExprNodeType::COMPOUND_PRED == child_expr->node_type()) {
+                _normalize_compound_predicate(
+                            child_expr, expr_ctx, 
+                            pdt, column_value_rangs, 
+                            in_predicate_checker, eq_predicate_checker);
+            }
+        }
+    }
+    
+    return Status::OK();
+}
+
+template <PrimitiveType T>
+Status VScanNode::_normalize_binary_in_compound_predicate(
+                vectorized::VExpr* expr, VExprContext* expr_ctx,
+                SlotDescriptor* slot, ColumnValueRange<T>& range,
+                PushDownType* pdt, const TCompoundType::type& compound_type) {
+    DCHECK(expr->children().size() == 2);
+    if (TExprNodeType::BINARY_PRED == expr->node_type()) {
+        auto eq_checker = [](const std::string& fn_name) { return fn_name == "eq"; };
+        auto ne_checker = [](const std::string& fn_name) { return fn_name == "ne"; };
+        auto noneq_checker = [](const std::string& fn_name) {
+            return fn_name != "ne" && fn_name != "eq";
+        };
+
+        StringRef value;
+        int slot_ref_child = -1;
+        PushDownType eq_pdt =
+                _should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr),
+                                                   expr_ctx, &value, &slot_ref_child, eq_checker);
+        PushDownType ne_pdt =
+                _should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr),
+                                                   expr_ctx, &value, &slot_ref_child, ne_checker);
+        PushDownType noneq_pdt = _should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), 
+                                                    expr_ctx, &value, &slot_ref_child, noneq_checker);
+
+        if (eq_pdt == PushDownType::UNACCEPTABLE 
+                && ne_pdt == PushDownType::UNACCEPTABLE
+                && noneq_pdt == PushDownType::UNACCEPTABLE) {
+            return Status::OK();
+        }
+        DCHECK(slot_ref_child >= 0);
+
+        if (eq_pdt == PushDownType::ACCEPTABLE) {
+            auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(slot->type().precision,
+                                                                           slot->type().scale);
+            auto fn_name = std::string("");

Review Comment:
   empty fn_name?



##########
be/src/olap/column_predicate.h:
##########
@@ -32,6 +32,10 @@ namespace doris {
 class Schema;
 class RowBlockV2;
 
+struct PredicateParams {

Review Comment:
   is it necessary to add a type for a single string value?



##########
gensrc/thrift/PaloInternalService.thrift:
##########
@@ -513,13 +513,21 @@ struct TFetchDataResult {
     4: optional Status.TStatus status
 }
 
+enum TCompoundType {
+    UNKNOWN = 0,
+    AND = 1,
+    OR = 2,
+    NOT = 3,
+}
+
 struct TCondition {
     1:  required string column_name
     2:  required string condition_op
     3:  required list<string> condition_values
     // In delete condition, the different column may have same column name, need
     // using unique id to distinguish them
     4:  optional i32 column_unique_id
+    5:  optional TCompoundType compound_type = TCompoundType.UNKNOWN

Review Comment:
   What's the difference between condition_op and compound_type?



##########
be/src/olap/rowset/rowset_reader_context.h:
##########
@@ -45,6 +46,7 @@ struct RowsetReaderContext {
     // column name -> column predicate
     // adding column_name for predicate to make use of column selectivity
     const std::vector<ColumnPredicate*>* predicates = nullptr;
+    const std::vector<ColumnPredicate*>* all_compound_predicates = nullptr;

Review Comment:
   a more intuitive name



##########
be/src/exec/olap_common.h:
##########
@@ -476,6 +518,18 @@ Status ColumnValueRange<primitive_type>::add_fixed_value(const CppType& value) {
     return Status::OK();
 }
 
+template <PrimitiveType primitive_type>
+Status ColumnValueRange<primitive_type>::add_compound_value(SQLFilterOp op, CppType value) {

Review Comment:
   Place compound-related code together for convenience.



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -868,6 +880,193 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+Status VScanNode::_normalize_compound_predicate(
+        vectorized::VExpr* expr, VExprContext* expr_ctx, PushDownType* pdt,
+        std::vector<ColumnValueRangeType>* column_value_rangs,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                in_predicate_checker,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                eq_predicate_checker) {
+    if (TExprNodeType::COMPOUND_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+        auto compound_fn_name = expr->fn().name.function_name;
+        auto children_num = expr->children().size();
+        for (auto i = 0; i < children_num; ++i) {
+            VExpr* child_expr = expr->children()[i];
+            if (TExprNodeType::BINARY_PRED == child_expr->node_type()) {
+                SlotDescriptor* slot = nullptr;
+                ColumnValueRangeType* range_on_slot = nullptr;
+                if (_is_predicate_acting_on_slot(child_expr, in_predicate_checker, &slot,
+                                                 &range_on_slot) ||
+                    _is_predicate_acting_on_slot(child_expr, eq_predicate_checker, &slot,
+                                                 &range_on_slot)) {
+                    ColumnValueRangeType active_range =
+                            *range_on_slot; // copy, in order not to affect the range in the _colname_to_value_range
+                    std::visit(
+                            [&](auto& value_range) {
+                                _normalize_binary_in_compound_predicate(
+                                        child_expr, expr_ctx, slot, value_range, pdt,
+                                        _get_compound_type_by_fn_name(compound_fn_name));
+                            },
+                            active_range);
+
+                    column_value_rangs->emplace_back(active_range);
+                }
+            } else if (TExprNodeType::COMPOUND_PRED == child_expr->node_type()) {
+                _normalize_compound_predicate(child_expr, expr_ctx, pdt, column_value_rangs,
+                                              in_predicate_checker, eq_predicate_checker);
+            }

Review Comment:
   What about other node types, such as IN?



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -868,6 +880,193 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+Status VScanNode::_normalize_compound_predicate(
+        vectorized::VExpr* expr, VExprContext* expr_ctx, PushDownType* pdt,
+        std::vector<ColumnValueRangeType>* column_value_rangs,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                in_predicate_checker,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                eq_predicate_checker) {
+    if (TExprNodeType::COMPOUND_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+        auto compound_fn_name = expr->fn().name.function_name;
+        auto children_num = expr->children().size();
+        for (auto i = 0; i < children_num; ++i) {
+            VExpr* child_expr = expr->children()[i];
+            if (TExprNodeType::BINARY_PRED == child_expr->node_type()) {
+                SlotDescriptor* slot = nullptr;
+                ColumnValueRangeType* range_on_slot = nullptr;
+                if (_is_predicate_acting_on_slot(child_expr, in_predicate_checker, &slot,
+                                                 &range_on_slot) ||
+                    _is_predicate_acting_on_slot(child_expr, eq_predicate_checker, &slot,
+                                                 &range_on_slot)) {
+                    ColumnValueRangeType active_range =
+                            *range_on_slot; // copy, in order not to affect the range in the _colname_to_value_range

Review Comment:
   It's a copy, so we can use a child class to store additional info.



##########
be/src/vec/exec/scan/vscan_node.h:
##########
@@ -197,7 +197,8 @@ class VScanNode : public ExecNode {
             _slot_id_to_value_range;
     // column -> ColumnValueRange
     std::unordered_map<std::string, ColumnValueRangeType> _colname_to_value_range;
-    // We use _colname_to_value_range to store a column and its corresponding value ranges.
+    std::vector<std::vector<ColumnValueRangeType>> _compound_value_ranges;

Review Comment:
   A comment would be appreciated.



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -447,6 +448,15 @@ VExpr* VScanNode::_normalize_predicate(VExpr* conjunct_expr_root) {
                         },
                         *range);
             }
+
+            if (pdt == PushDownType::UNACCEPTABLE && is_compound_predicate) {
+                std::vector<ColumnValueRangeType> column_value_rangs;

Review Comment:
   column_value_ranges



##########
be/src/vec/exec/scan/new_olap_scan_node.cpp:
##########
@@ -207,6 +207,29 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
         }
     }
 
+    for (auto i = 0; i < _compound_value_ranges.size(); ++i) {
+        std::vector<TCondition> conditions;
+        for (auto& iter : _compound_value_ranges[i]) {
+            std::vector<TCondition> filters;
+            std::visit([&](auto&& range) { 
+                if (range.is_boundary_value_range()) {
+                    range.to_boundary_condition(filters); 

Review Comment:
   what does boundary mean?



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -822,6 +832,186 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+Status VScanNode::_normalize_compound_predicate(vectorized::VExpr* expr,
+                    VExprContext* expr_ctx, 
+                    PushDownType* pdt,
+                    std::vector<ColumnValueRangeType>* column_value_rangs,
+                    const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>& in_predicate_checker,
+                    const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>& eq_predicate_checker) {
+    if (TExprNodeType::COMPOUND_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+        auto compound_fn_name = expr->fn().name.function_name;
+        auto children_num = expr->children().size();
+        for (auto i = 0; i < children_num; ++i) {
+            VExpr* child_expr = expr->children()[i];
+            if (TExprNodeType::BINARY_PRED == child_expr->node_type()) {
+                SlotDescriptor* slot = nullptr;
+                ColumnValueRangeType* range_on_slot = nullptr;
+                if (_is_predicate_acting_on_slot(child_expr, in_predicate_checker, &slot, &range_on_slot) ||
+                        _is_predicate_acting_on_slot(child_expr, eq_predicate_checker, &slot, &range_on_slot)) {
+                    ColumnValueRangeType active_range = *range_on_slot; // copy, in order not to affect the range in the _colname_to_value_range
+                    std::visit(
+                        [&](auto& value_range) {
+                           _normalize_binary_in_compound_predicate(
+                                    child_expr, expr_ctx, slot, value_range, pdt,
+                                    _get_compound_type_by_fn_name(compound_fn_name));
+                        },
+                        active_range);
+                    
+                    column_value_rangs->emplace_back(active_range);
+                }

Review Comment:
   no else branch?



##########
be/src/olap/rowset/segment_v2/segment_iterator.h:
##########
@@ -226,6 +250,10 @@ class SegmentIterator : public RowwiseIterator {
     StorageReadOptions _opts;
     // make a copy of `_opts.column_predicates` in order to make local changes
     std::vector<ColumnPredicate*> _col_predicates;
+    std::vector<ColumnPredicate*> _all_compound_col_predicates;

Review Comment:
   Suggest a clearer name: _not_pushdown_col_predicates



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -373,6 +392,145 @@ Status SegmentIterator::_apply_bitmap_index() {
     return Status::OK();
 }
 
+bool SegmentIterator::_is_literal_node(const TExprNodeType::type& node_type) {
+    switch (node_type) {
+    case TExprNodeType::BOOL_LITERAL:
+    case TExprNodeType::INT_LITERAL:
+    case TExprNodeType::LARGE_INT_LITERAL:
+    case TExprNodeType::FLOAT_LITERAL:
+    case TExprNodeType::DECIMAL_LITERAL:
+    case TExprNodeType::STRING_LITERAL:
+    case TExprNodeType::DATE_LITERAL:
+        return true;
+    default:
+        return false;
+    }
+}
+
+Status SegmentIterator::_execute_all_compound_predicates(vectorized::VExpr* expr) {
+    if (expr == nullptr) {
+        return Status::OK();
+    }
+
+    auto children = expr->children();
+    for (int i = 0; i < children.size(); ++i) {
+        RETURN_IF_ERROR(_execute_all_compound_predicates(children[i]));
+    }
+
+    auto node_type = expr->node_type();
+    if (node_type == TExprNodeType::SLOT_REF) {
+        _column_predicate_info->column_name = expr->expr_name();
+    } else if (_is_literal_node(node_type)) {
+        auto v_literal_expr =  dynamic_cast<doris::vectorized::VLiteral*>(expr);
+        _column_predicate_info->query_value = v_literal_expr->value();
+    } else if (node_type == TExprNodeType::BINARY_PRED) {
+        _column_predicate_info->query_op = expr->fn().name.function_name;
+        // get child condition result in compound condtions
+        auto column_sign = _gen_predicate_sign(_column_predicate_info.get());
+        _column_predicate_info.reset(new ColumnPredicateInfo());
+        if (_rowid_result_for_index.count(column_sign) > 0 
+                && _rowid_result_for_index[column_sign].first) {
+            auto apply_reuslt = _rowid_result_for_index[column_sign].second;
+            _compound_predicate_execute_result.push_back(apply_reuslt);
+        }
+    } else if (node_type == TExprNodeType::COMPOUND_PRED) {
+        auto function_name = expr->fn().name.function_name;
+        // execute logic function
+        RETURN_IF_ERROR(_execute_compound_fn(function_name));
+    }
+
+    return Status::OK();
+}
+
+Status SegmentIterator::_execute_compound_fn(const std::string& function_name) {
+    auto and_execute_result = [&]() {

Review Comment:
   Using a lambda here is not the simplest approach.



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -352,6 +362,15 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
 Status SegmentIterator::_apply_bitmap_index() {
     SCOPED_RAW_TIMER(&_opts.stats->bitmap_index_filter_timer);
     size_t input_rows = _row_bitmap.cardinality();
+    if (config::enable_index_apply_compound_predicates) {
+        RETURN_IF_ERROR(_apply_index_in_compound());
+        if (_is_index_for_compound_predicate()) {
+            _execute_all_compound_predicates(_remaining_vconjunct_root);
+            DCHECK(_compound_predicate_execute_result.size() == 1);
+            _row_bitmap &= _compound_predicate_execute_result[0];

Review Comment:
   Is it right to change _row_bitmap for non-_col_predicates?



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -373,6 +392,145 @@ Status SegmentIterator::_apply_bitmap_index() {
     return Status::OK();
 }
 
+bool SegmentIterator::_is_literal_node(const TExprNodeType::type& node_type) {
+    switch (node_type) {
+    case TExprNodeType::BOOL_LITERAL:
+    case TExprNodeType::INT_LITERAL:
+    case TExprNodeType::LARGE_INT_LITERAL:
+    case TExprNodeType::FLOAT_LITERAL:
+    case TExprNodeType::DECIMAL_LITERAL:
+    case TExprNodeType::STRING_LITERAL:
+    case TExprNodeType::DATE_LITERAL:
+        return true;
+    default:
+        return false;
+    }
+}
+
+Status SegmentIterator::_execute_all_compound_predicates(vectorized::VExpr* expr) {
+    if (expr == nullptr) {
+        return Status::OK();
+    }
+
+    auto children = expr->children();
+    for (int i = 0; i < children.size(); ++i) {
+        RETURN_IF_ERROR(_execute_all_compound_predicates(children[i]));
+    }
+
+    auto node_type = expr->node_type();
+    if (node_type == TExprNodeType::SLOT_REF) {
+        _column_predicate_info->column_name = expr->expr_name();
+    } else if (_is_literal_node(node_type)) {
+        auto v_literal_expr =  dynamic_cast<doris::vectorized::VLiteral*>(expr);
+        _column_predicate_info->query_value = v_literal_expr->value();
+    } else if (node_type == TExprNodeType::BINARY_PRED) {
+        _column_predicate_info->query_op = expr->fn().name.function_name;
+        // get child condition result in compound condtions
+        auto column_sign = _gen_predicate_sign(_column_predicate_info.get());
+        _column_predicate_info.reset(new ColumnPredicateInfo());
+        if (_rowid_result_for_index.count(column_sign) > 0 
+                && _rowid_result_for_index[column_sign].first) {
+            auto apply_reuslt = _rowid_result_for_index[column_sign].second;
+            _compound_predicate_execute_result.push_back(apply_reuslt);
+        }

Review Comment:
   The else branch should be handled as well.



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -101,7 +110,8 @@ Status Segment::new_iterator(const Schema& schema, const StorageReadOptions& rea
             continue;
         }
         int32_t uid = read_options.tablet_schema->column(column_id).unique_id();
-        if (_column_readers.count(uid) < 1 || !_column_readers.at(uid)->has_zone_map()) {
+        if (_column_readers.count(uid) < 1 || !_column_readers.at(uid)->has_zone_map() ||
+            cond_in_compound_query(column_id)) {

Review Comment:
   Is it necessary to add this check?
   
   If a column is in the remaining compound predicates, the zone map should still take effect to prune segments as normal.



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -1240,6 +1441,44 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
     return Status::OK();
 }
 
+void SegmentIterator::_output_index_return_column(vectorized::Block* block) {
+    if (block->rows() == 0) {
+        return;
+    }
+
+    for (auto column_sign : _rowid_result_for_index) {
+        block->insert({vectorized::ColumnUInt8::create(),
+                       std::make_shared<vectorized::DataTypeUInt8>(), column_sign.first});
+        if (!column_sign.second.first) {
+            // predicate not in compound query
+            continue;
+        }
+        _build_index_return_column(block, column_sign.first, column_sign.second.second);
+    }
+}
+
+void SegmentIterator::_build_index_return_column(vectorized::Block* block,

Review Comment:
   check for newest impl



##########
be/src/exec/olap_common.h:
##########
@@ -335,6 +373,10 @@ class ColumnValueRange {
                                                   primitive_type == PrimitiveType::TYPE_BOOLEAN ||
                                                   primitive_type == PrimitiveType::TYPE_DATETIME ||
                                                   primitive_type == PrimitiveType::TYPE_DATETIMEV2;
+
+    // range boundary value in CompoundPredicate
+    std::set<std::pair<SQLFilterOp, CppType>> _boundary_values;

Review Comment:
   what's the difference between _boundary_values and _low_value + _high_value?



##########
be/src/vec/exec/scan/new_olap_scan_node.cpp:
##########
@@ -232,6 +232,30 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
             }
         }
 
+        for (auto i = 0; i < _compound_value_ranges.size(); ++i) {
+            std::vector<TCondition> conditions;
+            for (auto& iter : _compound_value_ranges[i]) {
+                std::vector<TCondition> filters;
+                std::visit(
+                        [&](auto&& range) {
+                            if (range.is_boundary_value_range()) {
+                                range.to_boundary_condition(filters);
+                            } else {

Review Comment:
   Is there any chance of reaching the else branch?



##########
be/src/vec/exec/scan/new_olap_scanner.h:
##########
@@ -75,6 +77,7 @@ class NewOlapScanner : public VScanner {
 
     std::vector<uint32_t> _return_columns;
     std::unordered_set<uint32_t> _tablet_columns_convert_to_null_set;
+    std::vector<std::vector<TCondition>> _compound_filters;

Review Comment:
   a more intuitive name is better



##########
be/src/vec/exprs/vectorized_fn_call.cpp:
##########
@@ -104,12 +104,38 @@ doris::Status VectorizedFnCall::execute(VExprContext* context, doris::vectorized
     size_t num_columns_without_result = block->columns();
     // prepare a column to save result
     block->insert({nullptr, _data_type, _expr_name});
+    if (_function->can_fast_execute()) {

Review Comment:
   Is it necessary to set a limit on fast execute? Is it OK to just check the result column name?



##########
be/src/vec/exec/scan/new_olap_scan_node.h:
##########
@@ -65,6 +65,8 @@ class NewOlapScanNode : public VScanNode {
     std::vector<std::unique_ptr<TPaloScanRange>> _scan_ranges;
     OlapScanKeys _scan_keys;
     std::vector<TCondition> _olap_filters;
+    // compound filters in every conjunct, like: "(a or b) and (c or d)", (a or b) in conjuct[0], (c or d) in conjuct[1]
+    std::vector<std::vector<TCondition>> _compound_filters;

Review Comment:
   a more intuitive name is better



##########
be/src/olap/reader.h:
##########
@@ -204,6 +209,7 @@ class TabletReader {
     std::vector<bool> _is_lower_keys_included;
     std::vector<bool> _is_upper_keys_included;
     std::vector<ColumnPredicate*> _col_predicates;
+    std::vector<ColumnPredicate*> _all_compound_col_predicates;

Review Comment:
   a more intuitive name is better



##########
be/src/olap/iterators.h:
##########
@@ -82,6 +83,7 @@ class StorageReadOptions {
     // reader's column predicate, nullptr if not existed
     // used to fiter rows in row block
     std::vector<ColumnPredicate*> column_predicates;
+    std::vector<ColumnPredicate*> all_compound_column_predicates;

Review Comment:
   a more intuitive name is better



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -425,6 +425,7 @@ Status VScanNode::_normalize_predicate(VExpr* conjunct_expr_root, VExpr** output
             auto impl = conjunct_expr_root->get_impl();
             // If impl is not null, which means this a conjuncts from runtime filter.
             VExpr* cur_expr = impl ? const_cast<VExpr*>(impl) : conjunct_expr_root;
+            bool is_compound_predicate = TExprNodeType::COMPOUND_PRED == cur_expr->node_type();

Review Comment:
   This can be placed closer to the check statement.



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -868,6 +880,193 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+Status VScanNode::_normalize_compound_predicate(
+        vectorized::VExpr* expr, VExprContext* expr_ctx, PushDownType* pdt,
+        std::vector<ColumnValueRangeType>* column_value_rangs,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                in_predicate_checker,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                eq_predicate_checker) {
+    if (TExprNodeType::COMPOUND_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+        auto compound_fn_name = expr->fn().name.function_name;
+        auto children_num = expr->children().size();
+        for (auto i = 0; i < children_num; ++i) {
+            VExpr* child_expr = expr->children()[i];
+            if (TExprNodeType::BINARY_PRED == child_expr->node_type()) {
+                SlotDescriptor* slot = nullptr;
+                ColumnValueRangeType* range_on_slot = nullptr;
+                if (_is_predicate_acting_on_slot(child_expr, in_predicate_checker, &slot,
+                                                 &range_on_slot) ||
+                    _is_predicate_acting_on_slot(child_expr, eq_predicate_checker, &slot,
+                                                 &range_on_slot)) {
+                    ColumnValueRangeType active_range =
+                            *range_on_slot; // copy, in order not to affect the range in the _colname_to_value_range
+                    std::visit(

Review Comment:
   std::visit is not necessary for a single active_range.



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -868,6 +880,193 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+Status VScanNode::_normalize_compound_predicate(
+        vectorized::VExpr* expr, VExprContext* expr_ctx, PushDownType* pdt,
+        std::vector<ColumnValueRangeType>* column_value_rangs,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                in_predicate_checker,
+        const std::function<bool(const std::vector<VExpr*>&, const VSlotRef**, VExpr**)>&
+                eq_predicate_checker) {
+    if (TExprNodeType::COMPOUND_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+        auto compound_fn_name = expr->fn().name.function_name;
+        auto children_num = expr->children().size();
+        for (auto i = 0; i < children_num; ++i) {
+            VExpr* child_expr = expr->children()[i];
+            if (TExprNodeType::BINARY_PRED == child_expr->node_type()) {
+                SlotDescriptor* slot = nullptr;
+                ColumnValueRangeType* range_on_slot = nullptr;
+                if (_is_predicate_acting_on_slot(child_expr, in_predicate_checker, &slot,
+                                                 &range_on_slot) ||
+                    _is_predicate_acting_on_slot(child_expr, eq_predicate_checker, &slot,
+                                                 &range_on_slot)) {
+                    ColumnValueRangeType active_range =
+                            *range_on_slot; // copy, in order not to affect the range in the _colname_to_value_range
+                    std::visit(
+                            [&](auto& value_range) {
+                                _normalize_binary_in_compound_predicate(
+                                        child_expr, expr_ctx, slot, value_range, pdt,
+                                        _get_compound_type_by_fn_name(compound_fn_name));
+                            },
+                            active_range);
+
+                    column_value_rangs->emplace_back(active_range);
+                }
+            } else if (TExprNodeType::COMPOUND_PRED == child_expr->node_type()) {
+                _normalize_compound_predicate(child_expr, expr_ctx, pdt, column_value_rangs,
+                                              in_predicate_checker, eq_predicate_checker);
+            }
+        }
+    }
+
+    return Status::OK();
+}
+
+template <PrimitiveType T>
+Status VScanNode::_normalize_binary_in_compound_predicate(
+        vectorized::VExpr* expr, VExprContext* expr_ctx, SlotDescriptor* slot,
+        ColumnValueRange<T>& range, PushDownType* pdt, const TCompoundType::type& compound_type) {
+    DCHECK(expr->children().size() == 2);
+    if (TExprNodeType::BINARY_PRED == expr->node_type()) {
+        auto eq_checker = [](const std::string& fn_name) { return fn_name == "eq"; };
+        auto ne_checker = [](const std::string& fn_name) { return fn_name == "ne"; };
+        auto noneq_checker = [](const std::string& fn_name) {
+            return fn_name != "ne" && fn_name != "eq";
+        };
+
+        StringRef value;
+        int slot_ref_child = -1;
+        PushDownType eq_pdt;
+        PushDownType ne_pdt;
+        PushDownType noneq_pdt;
+        RETURN_IF_ERROR(_should_push_down_binary_predicate(
+                reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, &value, &slot_ref_child,
+                eq_checker, eq_pdt));
+        RETURN_IF_ERROR(_should_push_down_binary_predicate(
+                reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, &value, &slot_ref_child,
+                ne_checker, ne_pdt));
+        RETURN_IF_ERROR(_should_push_down_binary_predicate(
+                reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, &value, &slot_ref_child,
+                noneq_checker, noneq_pdt));
+        if (eq_pdt == PushDownType::UNACCEPTABLE && ne_pdt == PushDownType::UNACCEPTABLE &&
+            noneq_pdt == PushDownType::UNACCEPTABLE) {
+            return Status::OK();
+        }
+        DCHECK(slot_ref_child >= 0);
+
+        if (eq_pdt == PushDownType::ACCEPTABLE) {
+            auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(
+                    slot->type().precision, slot->type().scale);
+            auto fn_name = std::string("");
+            if (value.data != nullptr) {
+                if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING ||
+                              T == TYPE_HLL) {
+                    auto val = StringValue(value.data, value.size);
+                    RETURN_IF_ERROR(_change_value_range<true>(
+                            temp_range, reinterpret_cast<void*>(&val),
+                            ColumnValueRange<T>::add_fixed_value_range, fn_name));
+                } else {
+                    RETURN_IF_ERROR(_change_value_range<true>(
+                            temp_range, reinterpret_cast<void*>(const_cast<char*>(value.data)),
+                            ColumnValueRange<T>::add_fixed_value_range, fn_name));
+                }
+                range.intersection(temp_range);
+            }
+            *pdt = eq_pdt;
+
+            // exceed limit, no conditions will be pushed down to storage engine.
+            if (range.get_fixed_value_size() > _max_pushdown_conditions_per_column) {
+                range.set_whole_value_range();
+                *pdt = PushDownType::UNACCEPTABLE;
+            }
+            range.set_compound_type(compound_type);
+        }
+
+        if (ne_pdt == PushDownType::ACCEPTABLE) {
+            bool is_fixed_range = range.is_fixed_value_range();
+            auto not_in_range =
+                    ColumnValueRange<T>::create_empty_column_value_range(range.column_name());
+            auto fn_name = std::string("");
+            if (value.data != nullptr) {
+                auto fn_name = std::string("");
+                if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING ||
+                              T == TYPE_HLL) {
+                    auto val = StringValue(value.data, value.size);
+                    if (is_fixed_range) {
+                        RETURN_IF_ERROR(_change_value_range<true>(
+                                range, reinterpret_cast<void*>(&val),
+                                ColumnValueRange<T>::remove_fixed_value_range, fn_name));
+                    } else {
+                        RETURN_IF_ERROR(_change_value_range<true>(
+                                not_in_range, reinterpret_cast<void*>(&val),
+                                ColumnValueRange<T>::add_fixed_value_range, fn_name));
+                    }
+                } else {
+                    if (is_fixed_range) {
+                        RETURN_IF_ERROR(_change_value_range<true>(
+                                range, reinterpret_cast<void*>(const_cast<char*>(value.data)),
+                                ColumnValueRange<T>::remove_fixed_value_range, fn_name));
+                    } else {
+                        RETURN_IF_ERROR(_change_value_range<true>(
+                                not_in_range,
+                                reinterpret_cast<void*>(const_cast<char*>(value.data)),
+                                ColumnValueRange<T>::add_fixed_value_range, fn_name));
+                    }
+                }
+            }
+
+            if (is_fixed_range ||
+                not_in_range.get_fixed_value_size() <= _max_pushdown_conditions_per_column) {
+                if (!is_fixed_range) {
+                    _not_in_value_ranges.push_back(not_in_range);
+                }
+                *pdt = ne_pdt;
+            }
+            range.set_compound_type(compound_type);
+        }
+
+        if (noneq_pdt == PushDownType::ACCEPTABLE) {
+            const std::string& fn_name =
+                    reinterpret_cast<VectorizedFnCall*>(expr)->fn().name.function_name;
+
+            // where A = nullptr should return empty result set
+            if (value.data != nullptr) {
+                if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING ||
+                              T == TYPE_HLL) {
+                    auto val = StringValue(value.data, value.size);
+                    RETURN_IF_ERROR(_change_value_range<false>(
+                            range, reinterpret_cast<void*>(&val),
+                            ColumnValueRange<T>::add_compound_value_range, fn_name,

Review Comment:
   Why use the special add_compound_value_range here but the normal add_fixed_value_range in the other branches?



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -352,6 +362,15 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
 Status SegmentIterator::_apply_bitmap_index() {
     SCOPED_RAW_TIMER(&_opts.stats->bitmap_index_filter_timer);
     size_t input_rows = _row_bitmap.cardinality();
+    if (config::enable_index_apply_compound_predicates) {
+        RETURN_IF_ERROR(_apply_index_in_compound());
+        if (_is_index_for_compound_predicate()) {
+            _execute_all_compound_predicates(_remaining_vconjunct_root);
+            DCHECK(_compound_predicate_execute_result.size() == 1);
+            _row_bitmap &= _compound_predicate_execute_result[0];

Review Comment:
   _is_index_for_compound_predicate should check all exprs in _remaining_vconjunct_root.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org