You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/12/15 01:29:18 UTC
[incubator-doris] branch master updated: [Bug] Fix the bug of where
condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0a0e46f [Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)
0a0e46f is described below
commit 0a0e46fd5393c7abefd7b3efe80fec2dc4aa2caf
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Tue Dec 15 09:29:10 2020 +0800
[Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)
Also refactor ColumnValueRange and OlapScanNode.
This patch mainly does the following:
- Fix issue #5071
- Make type_min in ColumnValueRange a static member
- Add the type_limit class to make the code clearer
- Refactor the normalize_in_and_eq_predicate function
---
be/src/exec/olap_common.cpp | 5 +
be/src/exec/olap_common.h | 129 +++++++++---
be/src/exec/olap_scan_node.cpp | 406 +++++++++++++++++---------------------
be/src/exec/olap_scan_node.h | 8 +
be/src/exec/olap_scanner.h | 2 +-
be/src/runtime/datetime_value.cpp | 2 -
be/src/runtime/datetime_value.h | 14 +-
be/src/runtime/string_value.cpp | 12 ++
be/src/runtime/string_value.h | 7 +
be/src/runtime/type_limit.h | 80 ++++++++
10 files changed, 398 insertions(+), 267 deletions(-)
diff --git a/be/src/exec/olap_common.cpp b/be/src/exec/olap_common.cpp
index 6d51fb6..54c584e 100644
--- a/be/src/exec/olap_common.cpp
+++ b/be/src/exec/olap_common.cpp
@@ -36,6 +36,11 @@ std::string cast_to_string(__int128 value) {
}
template <>
+std::string cast_to_string(int8_t value) {
+ return std::to_string(static_cast<int>(value));
+}
+
+template <>
void ColumnValueRange<StringValue>::convert_to_fixed_value() {
return;
}
diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index b4cf31e..c952f7b 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -31,7 +31,7 @@
#include "exec/scan_node.h"
#include "gen_cpp/PlanNodes_types.h"
#include "olap/tuple.h"
-#include "runtime/datetime_value.h"
+#include "runtime/type_limit.h"
#include "runtime/descriptors.h"
#include "runtime/string_value.hpp"
@@ -42,6 +42,11 @@ std::string cast_to_string(T value) {
return boost::lexical_cast<std::string>(value);
}
+// TYPE_TINYINT should cast to int32_t to first
+// because it need to convert to num not char for build Olap fetch Query
+template <>
+std::string cast_to_string(int8_t);
+
/**
* @brief Column's value range
**/
@@ -50,7 +55,8 @@ class ColumnValueRange {
public:
typedef typename std::set<T>::iterator iterator_type;
ColumnValueRange();
- ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max);
+ ColumnValueRange(std::string col_name, PrimitiveType type);
+ ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max);
// should add fixed value before add range
Status add_fixed_value(T value);
@@ -73,10 +79,12 @@ public:
bool has_intersection(ColumnValueRange<T>& range);
+ void intersection(ColumnValueRange<T>& range);
+
void set_empty_value_range() {
_fixed_values.clear();
- _low_value = _type_max;
- _high_value = _type_min;
+ _low_value = TYPE_MAX;
+ _high_value = TYPE_MIN;
}
const std::set<T>& get_fixed_value_set() const { return _fixed_values; }
@@ -85,9 +93,9 @@ public:
T get_range_min_value() const { return _low_value; }
- bool is_low_value_mininum() const { return _low_value == _type_min; }
+ bool is_low_value_mininum() const { return _low_value == TYPE_MIN; }
- bool is_high_value_maximum() const { return _high_value == _type_max; }
+ bool is_high_value_maximum() const { return _high_value == TYPE_MAX; }
bool is_begin_include() const { return _low_op == FILTER_LARGER_OR_EQUAL; }
@@ -112,7 +120,7 @@ public:
}
} else {
TCondition low;
- if (_type_min != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
+ if (TYPE_MIN != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
low.__set_column_name(_column_name);
low.__set_condition_op((_low_op == FILTER_LARGER_OR_EQUAL ? ">=" : ">>"));
low.condition_values.push_back(cast_to_string(_low_value));
@@ -123,7 +131,7 @@ public:
}
TCondition high;
- if (_type_max != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
+ if (TYPE_MAX != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
high.__set_column_name(_column_name);
high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<"));
high.condition_values.push_back(cast_to_string(_high_value));
@@ -137,20 +145,30 @@ public:
void clear() {
_fixed_values.clear();
- _low_value = _type_min;
- _high_value = _type_max;
+ _low_value = TYPE_MIN;
+ _high_value = TYPE_MAX;
_low_op = FILTER_LARGER_OR_EQUAL;
_high_op = FILTER_LESS_OR_EQUAL;
}
+ bool is_whole_range() const {
+ return _fixed_values.empty() && _low_value == TYPE_MIN && _high_value == TYPE_MAX &&
+ _low_op == FILTER_LARGER_OR_EQUAL && _high_op == FILTER_LESS_OR_EQUAL;
+ }
+
+ static ColumnValueRange<T> create_empty_column_value_range(PrimitiveType type) {
+ return ColumnValueRange<T>("", type, type_limit<T>::max(), type_limit<T>::min());
+ }
+
protected:
bool is_in_range(const T& value);
private:
+ const static T TYPE_MIN; // Column type's min value
+ const static T TYPE_MAX; // Column type's max value
+
std::string _column_name;
PrimitiveType _column_type; // Column type (eg: TINYINT,SMALLINT,INT,BIGINT)
- T _type_min; // Column type's min value
- T _type_max; // Column type's max value
T _low_value; // Column's low value, closed interval at left
T _high_value; // Column's high value, open interval at right
SQLFilterOp _low_op;
@@ -223,14 +241,21 @@ typedef boost::variant<ColumnValueRange<int8_t>, ColumnValueRange<int16_t>,
ColumnValueRangeType;
template <class T>
+const T ColumnValueRange<T>::TYPE_MIN = type_limit<T>::min();
+template <class T>
+const T ColumnValueRange<T>::TYPE_MAX = type_limit<T>::max();
+
+template <class T>
ColumnValueRange<T>::ColumnValueRange() : _column_type(INVALID_TYPE) {}
template <class T>
-ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max)
- : _column_name(col_name),
+ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type)
+ : ColumnValueRange(std::move(col_name), type, TYPE_MIN, TYPE_MAX){}
+
+template <class T>
+ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max)
+ : _column_name(std::move(col_name)),
_column_type(type),
- _type_min(min),
- _type_max(max),
_low_value(min),
_high_value(max),
_low_op(FILTER_LARGER_OR_EQUAL),
@@ -325,14 +350,12 @@ void ColumnValueRange<T>::convert_to_fixed_value() {
++_low_value;
}
- if (_high_op == FILTER_LESS) {
- for (T v = _low_value; v < _high_value; ++v) {
- _fixed_values.insert(v);
- }
- } else {
- for (T v = _low_value; v <= _high_value; ++v) {
- _fixed_values.insert(v);
- }
+ for (T v = _low_value; v < _high_value; ++v) {
+ _fixed_values.insert(v);
+ }
+
+ if (_high_op == FILTER_LESS_OR_EQUAL) {
+ _fixed_values.insert(_high_value);
}
}
@@ -391,8 +414,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
}
}
- _high_value = _type_min;
- _low_value = _type_max;
+ _high_value = TYPE_MIN;
+ _low_value = TYPE_MAX;
} else {
if (_high_value > _low_value) {
switch (op) {
@@ -430,7 +453,6 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
}
break;
- break;
}
default: {
@@ -442,8 +464,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
if (FILTER_LARGER_OR_EQUAL == _low_op && FILTER_LESS_OR_EQUAL == _high_op &&
_high_value == _low_value) {
add_fixed_value(_high_value);
- _high_value = _type_min;
- _low_value = _type_max;
+ _high_value = TYPE_MIN;
+ _low_value = TYPE_MAX;
}
}
@@ -496,6 +518,55 @@ bool ColumnValueRange<T>::is_in_range(const T& value) {
}
template <class T>
+void ColumnValueRange<T>::intersection(ColumnValueRange<T>& range) {
+ // 1. clear if column type not match
+ if (_column_type != range._column_type) {
+ set_empty_value_range();
+ }
+
+ // 2. clear if any range is empty
+ if (is_empty_value_range() || range.is_empty_value_range()) {
+ set_empty_value_range();
+ }
+
+ std::set<T> result_values;
+ // 3. fixed_value intersection
+ if (is_fixed_value_range() || range.is_fixed_value_range()) {
+ if (is_fixed_value_range() && range.is_fixed_value_range()) {
+ set_intersection(_fixed_values.begin(), _fixed_values.end(), range._fixed_values.begin(),
+ range._fixed_values.end(),
+ std::inserter(result_values, result_values.begin()));
+ } else if (is_fixed_value_range() && !range.is_fixed_value_range()) {
+ iterator_type iter = _fixed_values.begin();
+
+ while (iter != _fixed_values.end()) {
+ if (range.is_in_range(*iter)) {
+ result_values.insert(*iter);
+ }
+ ++iter;
+ }
+ } else if (!is_fixed_value_range() && range.is_fixed_value_range()) {
+ iterator_type iter = range._fixed_values.begin();
+ while (iter != range._fixed_values.end()) {
+ if (this->is_in_range(*iter)) {
+ result_values.insert(*iter);
+ }
+ ++iter;
+ }
+ }
+
+ if (!result_values.empty()) {
+ _fixed_values = std::move(result_values);
+ } else {
+ set_empty_value_range();
+ }
+ } else {
+ add_range(range._high_op, range._high_value);
+ add_range(range._low_op, range._low_value);
+ }
+}
+
+template <class T>
bool ColumnValueRange<T>::has_intersection(ColumnValueRange<T>& range) {
// 1. return false if column type not match
if (_column_type != range._column_type) {
diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp
index ce79fb5..445f88c 100644
--- a/be/src/exec/olap_scan_node.cpp
+++ b/be/src/exec/olap_scan_node.cpp
@@ -30,7 +30,6 @@
#include "common/resource_tls.h"
#include "exprs/binary_predicate.h"
#include "exprs/expr.h"
-#include "exprs/in_predicate.h"
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/exec_env.h"
#include "runtime/row_batch.h"
@@ -455,45 +454,37 @@ Status OlapScanNode::normalize_conjuncts() {
for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
switch (slots[slot_idx]->type().type) {
- // TYPE_TINYINT use int32_t to present
- // because it's easy to convert to string for build Olap fetch Query
case TYPE_TINYINT: {
- ColumnValueRange<int32_t> range(
- slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
- std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+ ColumnValueRange<int8_t> range(
+ slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_SMALLINT: {
ColumnValueRange<int16_t> range(
- slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
- std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());
+ slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_INT: {
ColumnValueRange<int32_t> range(
- slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
- std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());
+ slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_BIGINT: {
ColumnValueRange<int64_t> range(
- slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
- std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
+ slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_LARGEINT: {
- __int128 min = MIN_INT128;
- __int128 max = MAX_INT128;
ColumnValueRange<__int128> range(slots[slot_idx]->col_name(),
- slots[slot_idx]->type().type, min, max);
+ slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
@@ -501,47 +492,36 @@ Status OlapScanNode::normalize_conjuncts() {
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL: {
- static char min_char = 0x00;
- static char max_char = 0xff;
ColumnValueRange<StringValue> range(
- slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
- StringValue(&min_char, 0), StringValue(&max_char, 1));
+ slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_DATE:
case TYPE_DATETIME: {
- DateTimeValue max_value = DateTimeValue::datetime_max_value();
- DateTimeValue min_value = DateTimeValue::datetime_min_value();
ColumnValueRange<DateTimeValue> range(slots[slot_idx]->col_name(),
- slots[slot_idx]->type().type, min_value,
- max_value);
+ slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_DECIMAL: {
- DecimalValue min = DecimalValue::get_min_decimal();
- DecimalValue max = DecimalValue::get_max_decimal();
ColumnValueRange<DecimalValue> range(slots[slot_idx]->col_name(),
- slots[slot_idx]->type().type, min, max);
+ slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_DECIMALV2: {
- DecimalV2Value min = DecimalV2Value::get_min_decimal();
- DecimalV2Value max = DecimalV2Value::get_max_decimal();
ColumnValueRange<DecimalV2Value> range(slots[slot_idx]->col_name(),
- slots[slot_idx]->type().type, min, max);
+ slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_BOOLEAN: {
- ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
- false, true);
+ ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
@@ -759,7 +739,10 @@ Status OlapScanNode::normalize_predicate(ColumnValueRange<T>& range, SlotDescrip
// 2. Normalize BinaryPredicate , add to ColumnValueRange
RETURN_IF_ERROR(normalize_noneq_binary_predicate(slot, &range));
- // 3. Add range to Column->ColumnValueRange map
+ // 3. Check whether range is empty, set _eos
+ if (range.is_empty_value_range()) _eos = true;
+
+ // 4. Add range to Column->ColumnValueRange map
_column_value_ranges[slot->col_name()] = range;
return Status::OK();
@@ -775,6 +758,142 @@ static bool ignore_cast(SlotDescriptor* slot, Expr* expr) {
return false;
}
+
+bool OlapScanNode::should_push_down_in_predicate(doris::SlotDescriptor *slot, doris::InPredicate* pred) {
+ if (pred->is_not_in()) {
+ // can not push down NOT IN predicate to storage engine
+ return false;
+ }
+
+ if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
+ // not a slot ref(column)
+ return false;
+ }
+
+ std::vector<SlotId> slot_ids;
+ if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
+ // not a single column predicate
+ return false;
+ }
+
+ if (slot_ids[0] != slot->id()) {
+ // predicate not related to current column
+ return false;
+ }
+
+ if (pred->get_child(0)->type().type != slot->type().type) {
+ if (!ignore_cast(slot, pred->get_child(0))) {
+ // the type of predicate not match the slot's type
+ return false;
+ }
+ }
+
+ VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
+
+ // if there are too many elements in InPredicate, exceed the limit,
+ // we will not push any condition of this column to storage engine.
+ // because too many conditions pushed down to storage engine may even
+ // slow down the query process.
+ // ATTN: This is just an experience value. You may need to try
+ // different thresholds to improve performance.
+ if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
+ VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
+ << _max_pushdown_conditions_per_column;
+ return false;
+ }
+
+ return true;
+}
+
+std::pair<bool, void*> OlapScanNode::should_push_down_eq_predicate(doris::SlotDescriptor *slot, doris::Expr *pred,
+ int conj_idx, int child_idx) {
+ auto result_pair = std::make_pair<bool, void*>(false, nullptr);
+
+ // Do not get slot_ref of column, should not push_down to Storage Engine
+ if (Expr::type_without_cast(pred->get_child(child_idx)) !=
+ TExprNodeType::SLOT_REF) {
+ return result_pair;
+ }
+
+ std::vector<SlotId> slot_ids;
+ if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
+ // not a single column predicate
+ return result_pair;
+ }
+
+ if (slot_ids[0] != slot->id()) {
+ // predicate not related to current column
+ return result_pair;
+ }
+
+ if (pred->get_child(child_idx)->type().type != slot->type().type) {
+ if (!ignore_cast(slot, pred->get_child(child_idx))) {
+ // the type of predicate not match the slot's type
+ return result_pair;
+ }
+ }
+
+ Expr* expr = pred->get_child(1 - child_idx);
+ if (!expr->is_constant()) {
+ // only handle constant value
+ return result_pair;
+ }
+
+ // get value in result pair
+ result_pair.second = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
+ // TODO(lhp) push down is null predicate to storage engine
+ // for case: where col = null
+ if (result_pair.second != nullptr) {
+ result_pair.first = true;
+ }
+ return result_pair;
+}
+
+template <typename T>
+Status OlapScanNode::insert_value_to_range(doris::ColumnValueRange<T>& temp_range, doris::PrimitiveType type, void *value) {
+ switch (type) {
+ case TYPE_TINYINT: {
+ int32_t v = *reinterpret_cast<int8_t*>(value);
+ temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
+ break;
+ }
+ case TYPE_DATE: {
+ DateTimeValue date_value =
+ *reinterpret_cast<DateTimeValue*>(value);
+ // There is must return empty data in olap_scan_node,
+ // Because data value loss accuracy
+ if (!date_value.check_loss_accuracy_cast_to_date()) {
+ temp_range.add_fixed_value(*reinterpret_cast<T*>(&date_value));
+ }
+ break;
+ }
+ case TYPE_DECIMAL:
+ case TYPE_DECIMALV2:
+ case TYPE_CHAR:
+ case TYPE_VARCHAR:
+ case TYPE_HLL:
+ case TYPE_DATETIME:
+ case TYPE_SMALLINT:
+ case TYPE_INT:
+ case TYPE_BIGINT:
+ case TYPE_LARGEINT: {
+ temp_range.add_fixed_value(*reinterpret_cast<T*>(value));
+ break;
+ }
+ case TYPE_BOOLEAN: {
+ bool v = *reinterpret_cast<bool*>(value);
+ temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
+ break;
+ }
+ default: {
+ LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
+ << type << "]";
+ return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
+ }
+ }
+ return Status::OK();
+}
+
// Construct the ColumnValueRange for one specified column
// It will only handle the InPredicate and eq BinaryPredicate in _conjunct_ctxs.
// It will try to push down conditions of that column as much as possible,
@@ -783,117 +902,38 @@ template <class T>
Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
ColumnValueRange<T>* range) {
std::vector<uint32_t> filter_conjuncts_index;
- bool meet_eq_binary = false;
for (int conj_idx = 0; conj_idx < _conjunct_ctxs.size(); ++conj_idx) {
+ // create empty range as temp range, temp range should do intersection on range
+ auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(range->type());
+
// 1. Normalize in conjuncts like 'where col in (v1, v2, v3)'
if (TExprOpcode::FILTER_IN == _conjunct_ctxs[conj_idx]->root()->op()) {
InPredicate* pred = dynamic_cast<InPredicate*>(_conjunct_ctxs[conj_idx]->root());
- if (pred->is_not_in()) {
- // can not push down NOT IN predicate to storage engine
- continue;
- }
-
- if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
- // not a slot ref(column)
- continue;
- }
-
- std::vector<SlotId> slot_ids;
- if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
- // not a single column predicate
- continue;
- }
-
- if (slot_ids[0] != slot->id()) {
- // predicate not related to current column
- continue;
- }
-
- if (pred->get_child(0)->type().type != slot->type().type) {
- if (!ignore_cast(slot, pred->get_child(0))) {
- // the type of predicate not match the slot's type
- continue;
- }
- }
-
- VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
-
- // if there are too many elements in InPredicate, exceed the limit,
- // we will not push any condition of this column to storage engine.
- // because too many conditions pushed down to storage engine may even
- // slow down the query process.
- // ATTN: This is just an experience value. You may need to try
- // different thresholds to improve performance.
- if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
- VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
- << _max_pushdown_conditions_per_column;
+ if (!should_push_down_in_predicate(slot, pred)) {
continue;
}
// begin to push InPredicate value into ColumnValueRange
HybridSetBase::IteratorBase* iter = pred->hybrid_set()->begin();
- auto skip_invalid_value_count = 0;
-
while (iter->has_next()) {
// column in (NULL,...) couldn't push down to StorageEngine
- // so that discard whole ColumnValueRange
+ // so set clear() temp_range to whole range
if (NULL == iter->get_value()) {
- range->clear();
- break;
- }
-
- switch (slot->type().type) {
- case TYPE_TINYINT: {
- int32_t v = *reinterpret_cast<int8_t*>(const_cast<void*>(iter->get_value()));
- range->add_fixed_value(*reinterpret_cast<T*>(&v));
+ temp_range.clear();
break;
}
- case TYPE_DATE: {
- DateTimeValue date_value =
- *reinterpret_cast<const DateTimeValue*>(iter->get_value());
- if (date_value.check_loss_accuracy_cast_to_date()) {
- // There is may return empty data in olap_scan_node,
- // Because data value loss accuracy, skip this value
- skip_invalid_value_count++;
- } else {
- range->add_fixed_value(*reinterpret_cast<T *>(&date_value));
- }
- break;
- }
- case TYPE_DECIMAL:
- case TYPE_DECIMALV2:
- case TYPE_LARGEINT:
- case TYPE_CHAR:
- case TYPE_VARCHAR:
- case TYPE_HLL:
- case TYPE_SMALLINT:
- case TYPE_INT:
- case TYPE_BIGINT:
- case TYPE_DATETIME: {
- range->add_fixed_value(
- *reinterpret_cast<T*>(const_cast<void*>(iter->get_value())));
- break;
- }
- case TYPE_BOOLEAN: {
- bool v = *reinterpret_cast<bool*>(const_cast<void*>(iter->get_value()));
- range->add_fixed_value(*reinterpret_cast<T*>(&v));
- break;
- }
- default: {
- break;
- }
- }
+ auto value = const_cast<void*>(iter->get_value());
+ RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
iter->next();
}
- // all value in hybrid set in skip, means all in condition
- // is invalid, so set eos = true
- if (skip_invalid_value_count == pred->hybrid_set()->size()) {
- _eos = true;
- }
-
- if (is_key_column(slot->col_name())) {
- filter_conjuncts_index.emplace_back(conj_idx);
+ // only where a in ('a', 'b', NULL) contain NULL will
+ // clear temp_range to whole range, no need do intersection
+ if (!temp_range.is_whole_range()) {
+ if (is_key_column(slot->col_name())) {
+ filter_conjuncts_index.emplace_back(conj_idx);
+ }
+ range->intersection(temp_range);
}
} // end of handle in predicate
@@ -904,119 +944,27 @@ Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
DCHECK(pred->get_num_children() == 2);
for (int child_idx = 0; child_idx < 2; ++child_idx) {
- if (Expr::type_without_cast(pred->get_child(child_idx)) !=
- TExprNodeType::SLOT_REF) {
- continue;
- }
-
- std::vector<SlotId> slot_ids;
- if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
- // not a single column predicate
- continue;
- }
-
- if (slot_ids[0] != slot->id()) {
- // predicate not related to current column
- continue;
- }
-
- if (pred->get_child(child_idx)->type().type != slot->type().type) {
- if (!ignore_cast(slot, pred->get_child(child_idx))) {
- // the type of predicate not match the slot's type
- continue;
- }
- }
-
- Expr* expr = pred->get_child(1 - child_idx);
- if (!expr->is_constant()) {
- // only handle constant value
- continue;
- }
-
- void* value = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
- // for case: where col = null
- if (value == NULL) {
+ // TODO: should use C++17 structured bindlings to refactor this code in the future:
+ // 'auto [should_push_down, value] = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);'
+ // make code tidier and readabler
+ auto result_pair = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);
+ if (!result_pair.first) {
continue;
}
+ auto value = result_pair.second;
- // begin to push condition value into ColumnValueRange
- // clear the ColumnValueRange before adding new fixed values.
- // because for AND compound predicates, it can overwrite previous conditions
- range->clear();
- switch (slot->type().type) {
- case TYPE_TINYINT: {
- int32_t v = *reinterpret_cast<int8_t*>(value);
- range->add_fixed_value(*reinterpret_cast<T*>(&v));
- break;
- }
- case TYPE_DATE: {
- DateTimeValue date_value =
- *reinterpret_cast<DateTimeValue*>(value);
- if (date_value.check_loss_accuracy_cast_to_date()) {
- // There is must return empty data in olap_scan_node,
- // Because data value loss accuracy
- _eos = true;
- }
- range->add_fixed_value(*reinterpret_cast<T*>(&date_value));
- break;
- }
- case TYPE_DECIMAL:
- case TYPE_DECIMALV2:
- case TYPE_CHAR:
- case TYPE_VARCHAR:
- case TYPE_HLL:
- case TYPE_DATETIME:
- case TYPE_SMALLINT:
- case TYPE_INT:
- case TYPE_BIGINT:
- case TYPE_LARGEINT: {
- range->add_fixed_value(*reinterpret_cast<T*>(value));
- break;
- }
- case TYPE_BOOLEAN: {
- bool v = *reinterpret_cast<bool*>(value);
- range->add_fixed_value(*reinterpret_cast<T*>(&v));
- break;
- }
- default: {
- LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
- << expr->type() << "]";
- return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
- }
- }
+ RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
if (is_key_column(slot->col_name())) {
filter_conjuncts_index.emplace_back(conj_idx);
}
- meet_eq_binary = true;
+ range->intersection(temp_range);
} // end for each binary predicate child
- } // end of handling eq binary predicate
-
-
- if (range->get_fixed_value_size() > 0) {
- // this columns already meet some eq predicates(IN or Binary),
- // There is no need to continue to iterate.
- // TODO(cmy): In fact, this part of the judgment should be completed in
- // the FE query planning stage. For the following predicate conditions,
- // it should be possible to eliminate at the FE side.
- // WHERE A = 1 and A in (2,3,4)
-
- if (meet_eq_binary) {
- // meet_eq_binary is true, means we meet at least one eq binary predicate.
- // this flag is to handle following case:
- // There are 2 conjuncts, first in a InPredicate, and second is a BinaryPredicate.
- // Firstly, we met a InPredicate, and add lots of values in ColumnValueRange,
- // if breaks, doris will read many rows filtered by these values.
- // But if continue to handle the BinaryPredicate, the value in ColumnValueRange
- // may become only one, which can reduce the rows read from storage engine.
- // So the strategy is to use the BinaryPredicate as much as possible.
- break;
- }
- }
+ } // end of handling eq binary predicate
}
+ // exceed limit, no conditions will be pushed down to storage engine.
if (range->get_fixed_value_size() > _max_pushdown_conditions_per_column) {
- // exceed limit, no conditions will be pushed down to storage engine.
range->clear();
} else {
std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
@@ -1158,8 +1106,6 @@ Status OlapScanNode::normalize_noneq_binary_predicate(SlotDescriptor* slot,
<< " value: " << *reinterpret_cast<T*>(value);
}
}
-
-
}
std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h
index 0f5cbb6..80e0f68 100644
--- a/be/src/exec/olap_scan_node.h
+++ b/be/src/exec/olap_scan_node.h
@@ -26,6 +26,7 @@
#include "exec/olap_common.h"
#include "exec/olap_scanner.h"
#include "exec/scan_node.h"
+#include "exprs/in_predicate.h"
#include "runtime/descriptors.h"
#include "runtime/row_batch_interface.hpp"
#include "runtime/vectorized_row_batch.h"
@@ -172,6 +173,13 @@ private:
void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot,
const std::string& is_null_str);
+ bool should_push_down_in_predicate(SlotDescriptor* slot, InPredicate* in_pred);
+
+ std::pair<bool, void*> should_push_down_eq_predicate(SlotDescriptor* slot, Expr* pred, int conj_idx, int child_idx);
+
+ template <typename T>
+ static Status insert_value_to_range(ColumnValueRange<T>& range, PrimitiveType type, void* value);
+
friend class OlapScanner;
std::vector<TCondition> _is_null_vector;
diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h
index 7f5e432..3dc3881 100644
--- a/be/src/exec/olap_scanner.h
+++ b/be/src/exec/olap_scanner.h
@@ -26,7 +26,7 @@
#include "common/status.h"
#include "exec/exec_node.h"
-#include "exec/olap_common.h"
+#include "exec/olap_utils.h"
#include "exprs/expr.h"
#include "gen_cpp/PaloInternalService_types.h"
#include "gen_cpp/PlanNodes_types.h"
diff --git a/be/src/runtime/datetime_value.cpp b/be/src/runtime/datetime_value.cpp
index 65e96f2..9b58370 100644
--- a/be/src/runtime/datetime_value.cpp
+++ b/be/src/runtime/datetime_value.cpp
@@ -59,8 +59,6 @@ static uint32_t calc_days_in_year(uint32_t year) {
return is_leap(year) ? 366 : 365;
}
-DateTimeValue DateTimeValue::_s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
-DateTimeValue DateTimeValue::_s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
RE2 DateTimeValue::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$");
bool DateTimeValue::check_range() const {
diff --git a/be/src/runtime/datetime_value.h b/be/src/runtime/datetime_value.h
index d092cc9..8751b93 100644
--- a/be/src/runtime/datetime_value.h
+++ b/be/src/runtime/datetime_value.h
@@ -426,9 +426,15 @@ public:
return std::string(buf, end - buf);
}
- static DateTimeValue datetime_min_value() { return _s_min_datetime_value; }
-
- static DateTimeValue datetime_max_value() { return _s_max_datetime_value; }
+ static DateTimeValue datetime_min_value() {
+ static DateTimeValue _s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
+ return _s_min_datetime_value;
+ }
+
+ static DateTimeValue datetime_max_value() {
+ static DateTimeValue _s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
+ return _s_max_datetime_value;
+ }
int64_t second_diff(const DateTimeValue& rhs) const {
int day_diff = daynr() - rhs.daynr();
@@ -542,8 +548,6 @@ private:
_day(day),
_microsecond(microsecond) {}
- static DateTimeValue _s_min_datetime_value;
- static DateTimeValue _s_max_datetime_value;
// RE2 obj is thread safe
static RE2 time_zone_offset_format_reg;
};
diff --git a/be/src/runtime/string_value.cpp b/be/src/runtime/string_value.cpp
index f4c15d3..6ae245b 100644
--- a/be/src/runtime/string_value.cpp
+++ b/be/src/runtime/string_value.cpp
@@ -39,4 +39,16 @@ std::size_t operator-(const StringValue& v1, const StringValue& v2) {
return 0;
}
+constexpr char StringValue::MIN_CHAR = 0x00;
+
+constexpr char StringValue::MAX_CHAR = 0xff;
+
+StringValue StringValue::min_string_val() {
+ return StringValue((char*)(&StringValue::MIN_CHAR), 0);
+}
+
+StringValue StringValue::max_string_val() {
+ return StringValue((char*)(&StringValue::MAX_CHAR), 1);
+}
+
} // namespace doris
diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h
index 3f9e3a3..160ff17 100644
--- a/be/src/runtime/string_value.h
+++ b/be/src/runtime/string_value.h
@@ -29,6 +29,9 @@ namespace doris {
// The returned StringValue of all functions that return StringValue
// shares its buffer the parent.
struct StringValue {
+ const static char MIN_CHAR;
+ const static char MAX_CHAR;
+
static const int MAX_LENGTH = (1 << 30);
// TODO: change ptr to an offset relative to a contiguous memory block,
// so that we can send row batches between nodes without having to swizzle
@@ -104,6 +107,10 @@ struct StringValue {
static StringValue from_string_val(const doris_udf::StringVal& sv) {
return StringValue(reinterpret_cast<char*>(sv.ptr), sv.len);
}
+
+ static StringValue min_string_val();
+
+ static StringValue max_string_val();
};
// This function must be called 'hash_value' to be picked up by boost.
diff --git a/be/src/runtime/type_limit.h b/be/src/runtime/type_limit.h
new file mode 100644
index 0000000..6224729
--- /dev/null
+++ b/be/src/runtime/type_limit.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_RUNTIME_TYPE_LIMIT_H
+#define DORIS_BE_RUNTIME_TYPE_LIMIT_H
+
+#include "runtime/datetime_value.h"
+#include "runtime/decimal_value.h"
+#include "runtime/decimalv2_value.h"
+#include "runtime/string_value.h"
+
+namespace doris {
+
+template <typename T>
+struct type_limit {
+ static T min() {
+ return std::numeric_limits<T>::min();
+ }
+ static T max() {
+ return std::numeric_limits<T>::max();
+ }
+};
+
+template <>
+struct type_limit<StringValue> {
+ static StringValue min() {
+ return StringValue::min_string_val();
+ }
+ static StringValue max() {
+ return StringValue::max_string_val();
+ }
+};
+
+template <>
+struct type_limit<DecimalValue> {
+ static DecimalValue min() {
+ return DecimalValue::get_min_decimal();
+ }
+ static DecimalValue max() {
+ return DecimalValue::get_max_decimal();
+ }
+};
+
+template <>
+struct type_limit<DecimalV2Value> {
+ static DecimalV2Value min() {
+ return DecimalV2Value::get_min_decimal();
+ }
+ static DecimalV2Value max() {
+ return DecimalV2Value::get_max_decimal();
+ }
+};
+
+template <>
+struct type_limit<DateTimeValue> {
+ static DateTimeValue min() {
+ return DateTimeValue::datetime_min_value();
+ }
+ static DateTimeValue max() {
+ return DateTimeValue::datetime_max_value();
+ }
+};
+
+} // namespace doris
+
+#endif
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org