You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/12/15 01:29:18 UTC
[incubator-doris] branch master updated: [Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 0a0e46f  [Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)
0a0e46f is described below

commit 0a0e46fd5393c7abefd7b3efe80fec2dc4aa2caf
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Tue Dec 15 09:29:10 2020 +0800

    [Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)
    
    Also refactor ColumnValueRange and OlapScanNode
    
    This patch mainly does the following:
    - Fix issue #5071
    - Make the type min/max values of ColumnValueRange static members (TYPE_MIN / TYPE_MAX)
    - Add a type_limit class to make the code clearer
    - Refactor the normalize_in_and_eq_predicate function
---
 be/src/exec/olap_common.cpp       |   5 +
 be/src/exec/olap_common.h         | 129 +++++++++---
 be/src/exec/olap_scan_node.cpp    | 406 +++++++++++++++++---------------------
 be/src/exec/olap_scan_node.h      |   8 +
 be/src/exec/olap_scanner.h        |   2 +-
 be/src/runtime/datetime_value.cpp |   2 -
 be/src/runtime/datetime_value.h   |  14 +-
 be/src/runtime/string_value.cpp   |  12 ++
 be/src/runtime/string_value.h     |   7 +
 be/src/runtime/type_limit.h       |  80 ++++++++
 10 files changed, 398 insertions(+), 267 deletions(-)

diff --git a/be/src/exec/olap_common.cpp b/be/src/exec/olap_common.cpp
index 6d51fb6..54c584e 100644
--- a/be/src/exec/olap_common.cpp
+++ b/be/src/exec/olap_common.cpp
@@ -36,6 +36,11 @@ std::string cast_to_string(__int128 value) {
 }
 
 template <>
+std::string cast_to_string(int8_t value) {
+    return std::to_string(static_cast<int>(value));
+}
+
+template <>
 void ColumnValueRange<StringValue>::convert_to_fixed_value() {
     return;
 }
diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index b4cf31e..c952f7b 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -31,7 +31,7 @@
 #include "exec/scan_node.h"
 #include "gen_cpp/PlanNodes_types.h"
 #include "olap/tuple.h"
-#include "runtime/datetime_value.h"
+#include "runtime/type_limit.h"
 #include "runtime/descriptors.h"
 #include "runtime/string_value.hpp"
 
@@ -42,6 +42,11 @@ std::string cast_to_string(T value) {
     return boost::lexical_cast<std::string>(value);
 }
 
+// TYPE_TINYINT should cast to int32_t to first
+// because it need to convert to num not char for build Olap fetch Query
+template <>
+std::string cast_to_string(int8_t);
+
 /**
  * @brief Column's value range
  **/
@@ -50,7 +55,8 @@ class ColumnValueRange {
 public:
     typedef typename std::set<T>::iterator iterator_type;
     ColumnValueRange();
-    ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max);
+    ColumnValueRange(std::string col_name, PrimitiveType type);
+    ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max);
 
     // should add fixed value before add range
     Status add_fixed_value(T value);
@@ -73,10 +79,12 @@ public:
 
     bool has_intersection(ColumnValueRange<T>& range);
 
+    void intersection(ColumnValueRange<T>& range);
+
     void set_empty_value_range() {
         _fixed_values.clear();
-        _low_value = _type_max;
-        _high_value = _type_min;
+        _low_value = TYPE_MAX;
+        _high_value = TYPE_MIN;
     }
 
     const std::set<T>& get_fixed_value_set() const { return _fixed_values; }
@@ -85,9 +93,9 @@ public:
 
     T get_range_min_value() const { return _low_value; }
 
-    bool is_low_value_mininum() const { return _low_value == _type_min; }
+    bool is_low_value_mininum() const { return _low_value == TYPE_MIN; }
 
-    bool is_high_value_maximum() const { return _high_value == _type_max; }
+    bool is_high_value_maximum() const { return _high_value == TYPE_MAX; }
 
     bool is_begin_include() const { return _low_op == FILTER_LARGER_OR_EQUAL; }
 
@@ -112,7 +120,7 @@ public:
             }
         } else {
             TCondition low;
-            if (_type_min != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
+            if (TYPE_MIN != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
                 low.__set_column_name(_column_name);
                 low.__set_condition_op((_low_op == FILTER_LARGER_OR_EQUAL ? ">=" : ">>"));
                 low.condition_values.push_back(cast_to_string(_low_value));
@@ -123,7 +131,7 @@ public:
             }
 
             TCondition high;
-            if (_type_max != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
+            if (TYPE_MAX != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
                 high.__set_column_name(_column_name);
                 high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<"));
                 high.condition_values.push_back(cast_to_string(_high_value));
@@ -137,20 +145,30 @@ public:
 
     void clear() {
         _fixed_values.clear();
-        _low_value = _type_min;
-        _high_value = _type_max;
+        _low_value = TYPE_MIN;
+        _high_value = TYPE_MAX;
         _low_op = FILTER_LARGER_OR_EQUAL;
         _high_op = FILTER_LESS_OR_EQUAL;
     }
 
+    bool is_whole_range() const {
+        return _fixed_values.empty() && _low_value == TYPE_MIN && _high_value == TYPE_MAX &&
+            _low_op == FILTER_LARGER_OR_EQUAL && _high_op == FILTER_LESS_OR_EQUAL;
+    }
+
+    static ColumnValueRange<T> create_empty_column_value_range(PrimitiveType type) {
+        return ColumnValueRange<T>("", type, type_limit<T>::max(), type_limit<T>::min());
+    }
+
 protected:
     bool is_in_range(const T& value);
 
 private:
+    const static T TYPE_MIN;                // Column type's min value
+    const static T TYPE_MAX;                // Column type's max value
+
     std::string _column_name;
     PrimitiveType _column_type; // Column type (eg: TINYINT,SMALLINT,INT,BIGINT)
-    T _type_min;                // Column type's min value
-    T _type_max;                // Column type's max value
     T _low_value;               // Column's low value, closed interval at left
     T _high_value;              // Column's high value, open interval at right
     SQLFilterOp _low_op;
@@ -223,14 +241,21 @@ typedef boost::variant<ColumnValueRange<int8_t>, ColumnValueRange<int16_t>,
         ColumnValueRangeType;
 
 template <class T>
+const T ColumnValueRange<T>::TYPE_MIN = type_limit<T>::min();
+template <class T>
+const T ColumnValueRange<T>::TYPE_MAX = type_limit<T>::max();
+
+template <class T>
 ColumnValueRange<T>::ColumnValueRange() : _column_type(INVALID_TYPE) {}
 
 template <class T>
-ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max)
-        : _column_name(col_name),
+ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type)
+        : ColumnValueRange(std::move(col_name), type, TYPE_MIN, TYPE_MAX){}
+
+template <class T>
+ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max)
+        : _column_name(std::move(col_name)),
           _column_type(type),
-          _type_min(min),
-          _type_max(max),
           _low_value(min),
           _high_value(max),
           _low_op(FILTER_LARGER_OR_EQUAL),
@@ -325,14 +350,12 @@ void ColumnValueRange<T>::convert_to_fixed_value() {
         ++_low_value;
     }
 
-    if (_high_op == FILTER_LESS) {
-        for (T v = _low_value; v < _high_value; ++v) {
-            _fixed_values.insert(v);
-        }
-    } else {
-        for (T v = _low_value; v <= _high_value; ++v) {
-            _fixed_values.insert(v);
-        }
+    for (T v = _low_value; v < _high_value; ++v) {
+        _fixed_values.insert(v);
+    }
+
+    if (_high_op == FILTER_LESS_OR_EQUAL) {
+        _fixed_values.insert(_high_value);
     }
 }
 
@@ -391,8 +414,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
         }
         }
 
-        _high_value = _type_min;
-        _low_value = _type_max;
+        _high_value = TYPE_MIN;
+        _low_value = TYPE_MAX;
     } else {
         if (_high_value > _low_value) {
             switch (op) {
@@ -430,7 +453,6 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
                 }
 
                 break;
-                break;
             }
 
             default: {
@@ -442,8 +464,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
         if (FILTER_LARGER_OR_EQUAL == _low_op && FILTER_LESS_OR_EQUAL == _high_op &&
             _high_value == _low_value) {
             add_fixed_value(_high_value);
-            _high_value = _type_min;
-            _low_value = _type_max;
+            _high_value = TYPE_MIN;
+            _low_value = TYPE_MAX;
         }
     }
 
@@ -496,6 +518,55 @@ bool ColumnValueRange<T>::is_in_range(const T& value) {
 }
 
 template <class T>
+void ColumnValueRange<T>::intersection(ColumnValueRange<T>& range) {
+    // 1. clear if column type not match
+    if (_column_type != range._column_type) {
+        set_empty_value_range();
+    }
+
+    // 2. clear if any range is empty
+    if (is_empty_value_range() || range.is_empty_value_range()) {
+        set_empty_value_range();
+    }
+
+    std::set<T> result_values;
+    // 3. fixed_value intersection
+    if (is_fixed_value_range() || range.is_fixed_value_range()) {
+        if (is_fixed_value_range() && range.is_fixed_value_range()) {
+            set_intersection(_fixed_values.begin(), _fixed_values.end(), range._fixed_values.begin(),
+                             range._fixed_values.end(),
+                             std::inserter(result_values, result_values.begin()));
+        } else if (is_fixed_value_range() && !range.is_fixed_value_range()) {
+            iterator_type iter = _fixed_values.begin();
+
+            while (iter != _fixed_values.end()) {
+                if (range.is_in_range(*iter)) {
+                    result_values.insert(*iter);
+                }
+                ++iter;
+            }
+        } else if (!is_fixed_value_range() && range.is_fixed_value_range()) {
+            iterator_type iter = range._fixed_values.begin();
+            while (iter != range._fixed_values.end()) {
+                if (this->is_in_range(*iter)) {
+                    result_values.insert(*iter);
+                }
+                ++iter;
+            }
+        }
+
+        if (!result_values.empty()) {
+            _fixed_values = std::move(result_values);
+        } else {
+            set_empty_value_range();
+        }
+    } else {
+        add_range(range._high_op, range._high_value);
+        add_range(range._low_op, range._low_value);
+    }
+}
+
+template <class T>
 bool ColumnValueRange<T>::has_intersection(ColumnValueRange<T>& range) {
     // 1. return false if column type not match
     if (_column_type != range._column_type) {
diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp
index ce79fb5..445f88c 100644
--- a/be/src/exec/olap_scan_node.cpp
+++ b/be/src/exec/olap_scan_node.cpp
@@ -30,7 +30,6 @@
 #include "common/resource_tls.h"
 #include "exprs/binary_predicate.h"
 #include "exprs/expr.h"
-#include "exprs/in_predicate.h"
 #include "gen_cpp/PlanNodes_types.h"
 #include "runtime/exec_env.h"
 #include "runtime/row_batch.h"
@@ -455,45 +454,37 @@ Status OlapScanNode::normalize_conjuncts() {
 
     for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
         switch (slots[slot_idx]->type().type) {
-            // TYPE_TINYINT use int32_t to present
-            // because it's easy to convert to string for build Olap fetch Query
         case TYPE_TINYINT: {
-            ColumnValueRange<int32_t> range(
-                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
-                    std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+            ColumnValueRange<int8_t> range(
+                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_SMALLINT: {
             ColumnValueRange<int16_t> range(
-                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
-                    std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());
+                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_INT: {
             ColumnValueRange<int32_t> range(
-                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
-                    std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());
+                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_BIGINT: {
             ColumnValueRange<int64_t> range(
-                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
-                    std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
+                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_LARGEINT: {
-            __int128 min = MIN_INT128;
-            __int128 max = MAX_INT128;
             ColumnValueRange<__int128> range(slots[slot_idx]->col_name(),
-                                             slots[slot_idx]->type().type, min, max);
+                                             slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
@@ -501,47 +492,36 @@ Status OlapScanNode::normalize_conjuncts() {
         case TYPE_CHAR:
         case TYPE_VARCHAR:
         case TYPE_HLL: {
-            static char min_char = 0x00;
-            static char max_char = 0xff;
             ColumnValueRange<StringValue> range(
-                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
-                    StringValue(&min_char, 0), StringValue(&max_char, 1));
+                    slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_DATE:
         case TYPE_DATETIME: {
-            DateTimeValue max_value = DateTimeValue::datetime_max_value();
-            DateTimeValue min_value = DateTimeValue::datetime_min_value();
             ColumnValueRange<DateTimeValue> range(slots[slot_idx]->col_name(),
-                                                  slots[slot_idx]->type().type, min_value,
-                                                  max_value);
+                                                  slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_DECIMAL: {
-            DecimalValue min = DecimalValue::get_min_decimal();
-            DecimalValue max = DecimalValue::get_max_decimal();
             ColumnValueRange<DecimalValue> range(slots[slot_idx]->col_name(),
-                                                 slots[slot_idx]->type().type, min, max);
+                                                 slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_DECIMALV2: {
-            DecimalV2Value min = DecimalV2Value::get_min_decimal();
-            DecimalV2Value max = DecimalV2Value::get_max_decimal();
             ColumnValueRange<DecimalV2Value> range(slots[slot_idx]->col_name(),
-                                                   slots[slot_idx]->type().type, min, max);
+                                                   slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
 
         case TYPE_BOOLEAN: {
-            ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
-                                         false, true);
+            ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
             normalize_predicate(range, slots[slot_idx]);
             break;
         }
@@ -759,7 +739,10 @@ Status OlapScanNode::normalize_predicate(ColumnValueRange<T>& range, SlotDescrip
     // 2. Normalize BinaryPredicate , add to ColumnValueRange
     RETURN_IF_ERROR(normalize_noneq_binary_predicate(slot, &range));
 
-    // 3. Add range to Column->ColumnValueRange map
+    // 3. Check whether range is empty, set _eos
+    if (range.is_empty_value_range()) _eos = true;
+
+    // 4. Add range to Column->ColumnValueRange map
     _column_value_ranges[slot->col_name()] = range;
 
     return Status::OK();
@@ -775,6 +758,142 @@ static bool ignore_cast(SlotDescriptor* slot, Expr* expr) {
     return false;
 }
 
+
+bool OlapScanNode::should_push_down_in_predicate(doris::SlotDescriptor *slot, doris::InPredicate* pred) {
+    if (pred->is_not_in()) {
+        // can not push down NOT IN predicate to storage engine
+        return false;
+    }
+
+    if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
+        // not a slot ref(column)
+        return false;
+    }
+
+    std::vector<SlotId> slot_ids;
+    if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
+        // not a single column predicate
+        return false;
+    }
+
+    if (slot_ids[0] != slot->id()) {
+        // predicate not related to current column
+        return false;
+    }
+
+    if (pred->get_child(0)->type().type != slot->type().type) {
+        if (!ignore_cast(slot, pred->get_child(0))) {
+            // the type of predicate not match the slot's type
+            return false;
+        }
+    }
+
+    VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
+
+    // if there are too many elements in InPredicate, exceed the limit,
+    // we will not push any condition of this column to storage engine.
+    // because too many conditions pushed down to storage engine may even
+    // slow down the query process.
+    // ATTN: This is just an experience value. You may need to try
+    // different thresholds to improve performance.
+    if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
+        VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
+                << _max_pushdown_conditions_per_column;
+        return false;
+    }
+
+    return true;
+}
+
+std::pair<bool, void*> OlapScanNode::should_push_down_eq_predicate(doris::SlotDescriptor *slot, doris::Expr *pred,
+        int conj_idx, int child_idx) {
+    auto result_pair = std::make_pair<bool, void*>(false, nullptr);
+
+    // Do not get slot_ref of column, should not push_down to Storage Engine
+    if (Expr::type_without_cast(pred->get_child(child_idx)) !=
+        TExprNodeType::SLOT_REF) {
+        return result_pair;
+    }
+
+    std::vector<SlotId> slot_ids;
+    if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
+        // not a single column predicate
+        return result_pair;
+    }
+
+    if (slot_ids[0] != slot->id()) {
+        // predicate not related to current column
+        return result_pair;
+    }
+
+    if (pred->get_child(child_idx)->type().type != slot->type().type) {
+        if (!ignore_cast(slot, pred->get_child(child_idx))) {
+            // the type of predicate not match the slot's type
+            return result_pair;
+        }
+    }
+
+    Expr* expr = pred->get_child(1 - child_idx);
+    if (!expr->is_constant()) {
+        // only handle constant value
+        return result_pair;
+    }
+
+    // get value in result pair
+    result_pair.second = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
+    // TODO(lhp) push down is null predicate to storage engine
+    // for case: where col = null
+    if (result_pair.second != nullptr) {
+        result_pair.first = true;
+    }
+    return result_pair;
+}
+
+template <typename T>
+Status OlapScanNode::insert_value_to_range(doris::ColumnValueRange<T>& temp_range, doris::PrimitiveType type, void *value) {
+    switch (type) {
+        case TYPE_TINYINT: {
+            int32_t v = *reinterpret_cast<int8_t*>(value);
+            temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
+            break;
+        }
+        case TYPE_DATE: {
+            DateTimeValue date_value =
+                    *reinterpret_cast<DateTimeValue*>(value);
+            // There is must return empty data in olap_scan_node,
+            // Because data value loss accuracy
+            if (!date_value.check_loss_accuracy_cast_to_date()) {
+                temp_range.add_fixed_value(*reinterpret_cast<T*>(&date_value));
+            }
+            break;
+        }
+        case TYPE_DECIMAL:
+        case TYPE_DECIMALV2:
+        case TYPE_CHAR:
+        case TYPE_VARCHAR:
+        case TYPE_HLL:
+        case TYPE_DATETIME:
+        case TYPE_SMALLINT:
+        case TYPE_INT:
+        case TYPE_BIGINT:
+        case TYPE_LARGEINT: {
+            temp_range.add_fixed_value(*reinterpret_cast<T*>(value));
+            break;
+        }
+        case TYPE_BOOLEAN: {
+            bool v = *reinterpret_cast<bool*>(value);
+            temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
+            break;
+        }
+        default: {
+            LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
+                         << type << "]";
+            return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
+        }
+    }
+    return Status::OK();
+}
+
 // Construct the ColumnValueRange for one specified column
 // It will only handle the InPredicate and eq BinaryPredicate in _conjunct_ctxs.
 // It will try to push down conditions of that column as much as possible,
@@ -783,117 +902,38 @@ template <class T>
 Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
                                                    ColumnValueRange<T>* range) {
     std::vector<uint32_t> filter_conjuncts_index;
-    bool meet_eq_binary = false;
     for (int conj_idx = 0; conj_idx < _conjunct_ctxs.size(); ++conj_idx) {
+        // create empty range as temp range, temp range should do intersection on range
+        auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(range->type());
+
         // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)'
         if (TExprOpcode::FILTER_IN == _conjunct_ctxs[conj_idx]->root()->op()) {
             InPredicate* pred = dynamic_cast<InPredicate*>(_conjunct_ctxs[conj_idx]->root());
-            if (pred->is_not_in()) {
-                // can not push down NOT IN predicate to storage engine
-                continue;
-            }
-
-            if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
-                // not a slot ref(column)
-                continue;
-            }
-
-            std::vector<SlotId> slot_ids;
-            if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
-                // not a single column predicate
-                continue;
-            }
-
-            if (slot_ids[0] != slot->id()) {
-                // predicate not related to current column
-                continue;
-            }
-
-            if (pred->get_child(0)->type().type != slot->type().type) {
-                if (!ignore_cast(slot, pred->get_child(0))) {
-                    // the type of predicate not match the slot's type
-                    continue;
-                }
-            }
-
-            VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
-
-            // if there are too many elements in InPredicate, exceed the limit,
-            // we will not push any condition of this column to storage engine.
-            // because too many conditions pushed down to storage engine may even
-            // slow down the query process.
-            // ATTN: This is just an experience value. You may need to try
-            // different thresholds to improve performance.
-            if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
-                VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
-                        << _max_pushdown_conditions_per_column;
+            if (!should_push_down_in_predicate(slot, pred)) {
                 continue;
             }
 
             // begin to push InPredicate value into ColumnValueRange
             HybridSetBase::IteratorBase* iter = pred->hybrid_set()->begin();
-            auto skip_invalid_value_count = 0;
-
             while (iter->has_next()) {
                 // column in (NULL,...) couldn't push down to StorageEngine
-                // so that discard whole ColumnValueRange
+                // so set clear() temp_range to whole range
                 if (NULL == iter->get_value()) {
-                    range->clear();
-                    break;
-                }
-
-                switch (slot->type().type) {
-                case TYPE_TINYINT: {
-                    int32_t v = *reinterpret_cast<int8_t*>(const_cast<void*>(iter->get_value()));
-                    range->add_fixed_value(*reinterpret_cast<T*>(&v));
+                    temp_range.clear();
                     break;
                 }
-                case TYPE_DATE: {
-                    DateTimeValue date_value =
-                        *reinterpret_cast<const DateTimeValue*>(iter->get_value());
-                    if (date_value.check_loss_accuracy_cast_to_date()) {
-                        // There is may return empty data in olap_scan_node,
-                        // Because data value loss accuracy, skip this value
-                        skip_invalid_value_count++;
-                    } else {
-                        range->add_fixed_value(*reinterpret_cast<T *>(&date_value));
-                    }
-                    break;
-                }
-                case TYPE_DECIMAL:
-                case TYPE_DECIMALV2:
-                case TYPE_LARGEINT:
-                case TYPE_CHAR:
-                case TYPE_VARCHAR:
-                case TYPE_HLL:
-                case TYPE_SMALLINT:
-                case TYPE_INT:
-                case TYPE_BIGINT:
-                case TYPE_DATETIME: {
-                    range->add_fixed_value(
-                            *reinterpret_cast<T*>(const_cast<void*>(iter->get_value())));
-                    break;
-                }
-                case TYPE_BOOLEAN: {
-                    bool v = *reinterpret_cast<bool*>(const_cast<void*>(iter->get_value()));
-                    range->add_fixed_value(*reinterpret_cast<T*>(&v));
-                    break;
-                }
-                default: {
-                    break;
-                }
-                }
+                auto value = const_cast<void*>(iter->get_value());
+                RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
                 iter->next();
             }
 
-            // all value in hybrid set in skip, means all in condition
-            // is invalid, so set eos = true
-            if (skip_invalid_value_count == pred->hybrid_set()->size()) {
-                _eos = true;
-            }
-
-            if (is_key_column(slot->col_name())) {
-                filter_conjuncts_index.emplace_back(conj_idx);
+            // only where a in ('a', 'b', NULL) contain NULL will
+            // clear temp_range to whole range, no need do intersection
+            if (!temp_range.is_whole_range()) {
+               if (is_key_column(slot->col_name())) {
+                   filter_conjuncts_index.emplace_back(conj_idx);
+               }
+               range->intersection(temp_range);
             }
         } // end of handle in predicate
 
@@ -904,119 +944,27 @@ Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
             DCHECK(pred->get_num_children() == 2);
 
             for (int child_idx = 0; child_idx < 2; ++child_idx) {
-                if (Expr::type_without_cast(pred->get_child(child_idx)) !=
-                    TExprNodeType::SLOT_REF) {
-                    continue;
-                }
-
-                std::vector<SlotId> slot_ids;
-                if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
-                    // not a single column predicate
-                    continue;
-                }
-
-                if (slot_ids[0] != slot->id()) {
-                    // predicate not related to current column
-                    continue;
-                }
-
-                if (pred->get_child(child_idx)->type().type != slot->type().type) {
-                    if (!ignore_cast(slot, pred->get_child(child_idx))) {
-                        // the type of predicate not match the slot's type
-                        continue;
-                    }
-                }
-
-                Expr* expr = pred->get_child(1 - child_idx);
-                if (!expr->is_constant()) {
-                    // only handle constant value
-                    continue;
-                }
-
-                void* value = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
-                // for case: where col = null
-                if (value == NULL) {
+                // TODO: should use C++17 structured bindlings to refactor this code in the future:
+                // 'auto [should_push_down, value] = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);'
+                // make code tidier and readabler
+                auto result_pair = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);
+                if (!result_pair.first) {
                     continue;
                 }
+                auto value = result_pair.second;
 
-                // begin to push condition value into ColumnValueRange
-                // clear the ColumnValueRange before adding new fixed values.
-                // because for AND compound predicates, it can overwrite previous conditions
-                range->clear();
-                switch (slot->type().type) {
-                    case TYPE_TINYINT: {
-                        int32_t v = *reinterpret_cast<int8_t*>(value);
-                        range->add_fixed_value(*reinterpret_cast<T*>(&v));
-                        break;
-                    }
-                    case TYPE_DATE: {
-                        DateTimeValue date_value =
-                            *reinterpret_cast<DateTimeValue*>(value);
-                        if (date_value.check_loss_accuracy_cast_to_date()) {
-                           // There is must return empty data in olap_scan_node,
-                           // Because data value loss accuracy
-                           _eos = true;
-                        }
-                        range->add_fixed_value(*reinterpret_cast<T*>(&date_value));
-                        break;
-                    }
-                    case TYPE_DECIMAL:
-                    case TYPE_DECIMALV2:
-                    case TYPE_CHAR:
-                    case TYPE_VARCHAR:
-                    case TYPE_HLL:
-                    case TYPE_DATETIME:
-                    case TYPE_SMALLINT:
-                    case TYPE_INT:
-                    case TYPE_BIGINT:
-                    case TYPE_LARGEINT: {
-                        range->add_fixed_value(*reinterpret_cast<T*>(value));
-                        break;
-                    }
-                    case TYPE_BOOLEAN: {
-                        bool v = *reinterpret_cast<bool*>(value);
-                        range->add_fixed_value(*reinterpret_cast<T*>(&v));
-                        break;
-                    }
-                    default: {
-                        LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
-                            << expr->type() << "]";
-                        return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
-                    }
-                }
+                RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
 
                 if (is_key_column(slot->col_name())) {
                     filter_conjuncts_index.emplace_back(conj_idx);
                 }
-                meet_eq_binary = true;
+                range->intersection(temp_range);
             } // end for each binary predicate child
-        }     // end of handling eq binary predicate
-
-
-        if (range->get_fixed_value_size() > 0) {
-            // this columns already meet some eq predicates(IN or Binary),
-            // There is no need to continue to iterate.
-            // TODO(cmy): In fact, this part of the judgment should be completed in
-            // the FE query planning stage. For the following predicate conditions,
-            // it should be possible to eliminate at the FE side.
-            //      WHERE A = 1 and A in (2,3,4)
-
-            if (meet_eq_binary) {
-                // meet_eq_binary is true, means we meet at least one eq binary predicate.
-                // this flag is to handle following case:
-                // There are 2 conjuncts, first in a InPredicate, and second is a BinaryPredicate.
-                // Firstly, we met a InPredicate, and add lots of values in ColumnValueRange,
-                // if breaks, doris will read many rows filtered by these values.
-                // But if continue to handle the BinaryPredicate, the value in ColumnValueRange
-                // may become only one, which can reduce the rows read from storage engine.
-                // So the strategy is to use the BinaryPredicate as much as possible.
-                break;
-            }
-        }
+        } // end of handling eq binary predicate
     }
 
+    // exceed limit, no conditions will be pushed down to storage engine.
     if (range->get_fixed_value_size() > _max_pushdown_conditions_per_column) {
-        // exceed limit, no conditions will be pushed down to storage engine.
         range->clear();
     } else {
         std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
@@ -1158,8 +1106,6 @@ Status OlapScanNode::normalize_noneq_binary_predicate(SlotDescriptor* slot,
                         << " value: " << *reinterpret_cast<T*>(value);
             }
         }
-
-
     }
 
     std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h
index 0f5cbb6..80e0f68 100644
--- a/be/src/exec/olap_scan_node.h
+++ b/be/src/exec/olap_scan_node.h
@@ -26,6 +26,7 @@
 #include "exec/olap_common.h"
 #include "exec/olap_scanner.h"
 #include "exec/scan_node.h"
+#include "exprs/in_predicate.h"
 #include "runtime/descriptors.h"
 #include "runtime/row_batch_interface.hpp"
 #include "runtime/vectorized_row_batch.h"
@@ -172,6 +173,13 @@ private:
     void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot,
                                               const std::string& is_null_str);
 
+    bool should_push_down_in_predicate(SlotDescriptor* slot, InPredicate* in_pred);
+
+    std::pair<bool, void*> should_push_down_eq_predicate(SlotDescriptor* slot, Expr* pred, int conj_idx, int child_idx);
+
+    template <typename T>
+    static Status insert_value_to_range(ColumnValueRange<T>& range, PrimitiveType type, void* value);
+
     friend class OlapScanner;
 
     std::vector<TCondition> _is_null_vector;
diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h
index 7f5e432..3dc3881 100644
--- a/be/src/exec/olap_scanner.h
+++ b/be/src/exec/olap_scanner.h
@@ -26,7 +26,7 @@
 
 #include "common/status.h"
 #include "exec/exec_node.h"
-#include "exec/olap_common.h"
+#include "exec/olap_utils.h"
 #include "exprs/expr.h"
 #include "gen_cpp/PaloInternalService_types.h"
 #include "gen_cpp/PlanNodes_types.h"
diff --git a/be/src/runtime/datetime_value.cpp b/be/src/runtime/datetime_value.cpp
index 65e96f2..9b58370 100644
--- a/be/src/runtime/datetime_value.cpp
+++ b/be/src/runtime/datetime_value.cpp
@@ -59,8 +59,6 @@ static uint32_t calc_days_in_year(uint32_t year) {
     return is_leap(year) ? 366 : 365;
 }
 
-DateTimeValue DateTimeValue::_s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
-DateTimeValue DateTimeValue::_s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
 RE2 DateTimeValue::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$");
 
 bool DateTimeValue::check_range() const {
diff --git a/be/src/runtime/datetime_value.h b/be/src/runtime/datetime_value.h
index d092cc9..8751b93 100644
--- a/be/src/runtime/datetime_value.h
+++ b/be/src/runtime/datetime_value.h
@@ -426,9 +426,15 @@ public:
         return std::string(buf, end - buf);
     }
 
-    static DateTimeValue datetime_min_value() { return _s_min_datetime_value; }
-
-    static DateTimeValue datetime_max_value() { return _s_max_datetime_value; }
+    static DateTimeValue datetime_min_value() { // returns a copy of the minimum DATETIME sentinel
+        static DateTimeValue _s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1); // function-local static, built on first call; NOTE(review): args presumably mean 0000-01-01 00:00:00 - confirm against the constructor's parameter order
+        return _s_min_datetime_value; // returned by value, so callers cannot alter the shared instance
+    }
+    
+    static DateTimeValue datetime_max_value() { // returns a copy of the maximum DATETIME sentinel
+        static DateTimeValue _s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31); // function-local static, built on first call; NOTE(review): args presumably mean 9999-12-31 23:59:59 - confirm against the constructor's parameter order
+        return _s_max_datetime_value; // returned by value, so callers cannot alter the shared instance
+    }
 
     int64_t second_diff(const DateTimeValue& rhs) const {
         int day_diff = daynr() - rhs.daynr();
@@ -542,8 +548,6 @@ private:
               _day(day),
               _microsecond(microsecond) {}
 
-    static DateTimeValue _s_min_datetime_value;
-    static DateTimeValue _s_max_datetime_value;
     // RE2 obj is thread safe
     static RE2 time_zone_offset_format_reg;
 };
diff --git a/be/src/runtime/string_value.cpp b/be/src/runtime/string_value.cpp
index f4c15d3..6ae245b 100644
--- a/be/src/runtime/string_value.cpp
+++ b/be/src/runtime/string_value.cpp
@@ -39,4 +39,16 @@ std::size_t operator-(const StringValue& v1, const StringValue& v2) {
     return 0;
 }
 
+constexpr char StringValue::MIN_CHAR = 0x00; // byte that min_string_val() points at
+
+constexpr char StringValue::MAX_CHAR = static_cast<char>(0xff); // byte that max_string_val() points at; 0xff is out of range where char is signed, so the conversion is spelled out instead of left implicit
+
+StringValue StringValue::min_string_val() { // zero-length value whose buffer aliases the constant MIN_CHAR; callers must never write through it
+    return StringValue(const_cast<char*>(&StringValue::MIN_CHAR), 0); // named cast makes the dropped const explicit; length 0 means the byte itself is not part of the string
+}
+
+StringValue StringValue::max_string_val() { // one-byte value whose buffer aliases the constant MAX_CHAR; callers must never write through it
+    return StringValue(const_cast<char*>(&StringValue::MAX_CHAR), 1); // named cast makes the dropped const explicit; length 1 covers exactly the MAX_CHAR byte
+}
+
 } // namespace doris
diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h
index 3f9e3a3..160ff17 100644
--- a/be/src/runtime/string_value.h
+++ b/be/src/runtime/string_value.h
@@ -29,6 +29,9 @@ namespace doris {
 // The returned StringValue of all functions that return StringValue
 // shares its buffer the parent.
 struct StringValue {
+    const static char MIN_CHAR;
+    const static char MAX_CHAR;
+
     static const int MAX_LENGTH = (1 << 30);
     // TODO: change ptr to an offset relative to a contiguous memory block,
     // so that we can send row batches between nodes without having to swizzle
@@ -104,6 +107,10 @@ struct StringValue {
     static StringValue from_string_val(const doris_udf::StringVal& sv) {
         return StringValue(reinterpret_cast<char*>(sv.ptr), sv.len);
     }
+
+    static StringValue min_string_val();
+
+    static StringValue max_string_val();
 };
 
 // This function must be called 'hash_value' to be picked up by boost.
diff --git a/be/src/runtime/type_limit.h b/be/src/runtime/type_limit.h
new file mode 100644
index 0000000..6224729
--- /dev/null
+++ b/be/src/runtime/type_limit.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_RUNTIME_TYPE_LIMIT_H
+#define DORIS_BE_RUNTIME_TYPE_LIMIT_H
+
+#include "runtime/datetime_value.h"
+#include "runtime/decimal_value.h"
+#include "runtime/decimalv2_value.h"
+#include "runtime/string_value.h"
+
+namespace doris {
+
+// Generic bounds of T taken from std::numeric_limits (declared in <limits>, which this header does not include itself).
+template <typename T>
+struct type_limit {
+    // lowest(), not min(): for floating-point T, numeric_limits::min() is the smallest
+    // positive normal value, not the lower bound. For integer T both are identical.
+    static T min() { return std::numeric_limits<T>::lowest(); }
+    // Largest finite value representable by T.
+    static T max() { return std::numeric_limits<T>::max(); }
+};
+
+template <> // specialization: StringValue bounds come from StringValue's own sentinel factories
+struct type_limit<StringValue> {
+    static StringValue min() { // forwards to StringValue::min_string_val()
+        return StringValue::min_string_val();
+    }
+    static StringValue max() { // forwards to StringValue::max_string_val()
+        return StringValue::max_string_val();
+    }
+};
+
+template <> // specialization: DecimalValue supplies its own bounds
+struct type_limit<DecimalValue> {
+    static DecimalValue min() { // forwards to DecimalValue::get_min_decimal()
+        return DecimalValue::get_min_decimal();
+    }
+    static DecimalValue max() { // forwards to DecimalValue::get_max_decimal()
+        return DecimalValue::get_max_decimal();
+    }
+};
+
+template <> // specialization: DecimalV2Value supplies its own bounds
+struct type_limit<DecimalV2Value> {
+    static DecimalV2Value min() { // forwards to DecimalV2Value::get_min_decimal()
+        return DecimalV2Value::get_min_decimal();
+    }
+    static DecimalV2Value max() { // forwards to DecimalV2Value::get_max_decimal()
+        return DecimalV2Value::get_max_decimal();
+    }
+};
+
+template <> // specialization: DateTimeValue supplies its own bounds
+struct type_limit<DateTimeValue> {
+    static DateTimeValue min() { // forwards to DateTimeValue::datetime_min_value()
+        return DateTimeValue::datetime_min_value();
+    }
+    static DateTimeValue max() { // forwards to DateTimeValue::datetime_max_value()
+        return DateTimeValue::datetime_max_value();
+    }
+};
+
+} // namespace doris
+
+#endif


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org