You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/04/06 02:44:58 UTC

[incubator-doris] 12/19: [fix](storage) Fix query result error due to find code by bound (#8787)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch dev-1.0.1
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git

commit be2fd6fecff1c87dd989a4d9b084b45cf72d9e8d
Author: ZenoYang <co...@qq.com>
AuthorDate: Sun Apr 3 10:38:14 2022 +0800

    [fix](storage) Fix query result error due to find code by bound (#8787)
    
    Problem recurrence
    SSB single table `lineorder_flat`, the query SQL is as follows:
    ```sql
    SELECT
        sum(LO_REVENUE),
        (LO_ORDERDATE DIV 10000) AS year,
        P_BRAND
    FROM lineorder_flat
    WHERE P_BRAND >= 'MFGR#22211111' AND P_BRAND <= 'MFGR#22281111' AND S_REGION = 'ASIA' and (LO_ORDERDATE DIV 10000) = 1992
    GROUP BY
        year,
        P_BRAND
    ORDER BY
        year,
        P_BRAND;
    ```
    
    when `enable_low_cardinality_optimize=false`, query result:
    ```sql
    +-------------------+------+-----------+
    | sum(`LO_REVENUE`) | year | P_BRAND   |
    +-------------------+------+-----------+
    |       65423264312 | 1992 | MFGR#2222 |
    |       66936772687 | 1992 | MFGR#2223 |
    |       64047191934 | 1992 | MFGR#2224 |
    |       65744559138 | 1992 | MFGR#2225 |
    |       66993045668 | 1992 | MFGR#2226 |
    |       67411226147 | 1992 | MFGR#2227 |
    |       69390885970 | 1992 | MFGR#2228 |
    +-------------------+------+-----------+
    ```
    
    when `enable_low_cardinality_optimize=true`, query result:
    ```sql
    +-------------------+------+-----------+
    | sum(`LO_REVENUE`) | year | P_BRAND   |
    +-------------------+------+-----------+
    |       66936772687 | 1992 | MFGR#2223 |
    |       64047191934 | 1992 | MFGR#2224 |
    |       65744559138 | 1992 | MFGR#2225 |
    |       66993045668 | 1992 | MFGR#2226 |
    |       67411226147 | 1992 | MFGR#2227 |
    |       69390885970 | 1992 | MFGR#2228 |
    +-------------------+------+-----------+
    ```
    
    One line less than the correct result.
    
    The reason is that 'MFGR#22211111' is not in the dictionary, so get the boundary code (`find_code_by_bound` method), but there is a bug here.
---
 be/src/olap/comparison_predicate.cpp   |  9 +++++----
 be/src/olap/in_list_predicate.cpp      |  3 +--
 be/src/vec/columns/column_dictionary.h | 34 +++++++++++++++++++++++-----------
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp
index ef6ee3a01c..57549c349c 100644
--- a/be/src/olap/comparison_predicate.cpp
+++ b/be/src/olap/comparison_predicate.cpp
@@ -520,8 +520,7 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=)
                 auto& dict_col =                                                               \
                         reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(    \
                                 *col_ptr);                                                     \
-                auto code = dict_col.find_code(_value);                                        \
-                _dict_code = code;                                                             \
+                _dict_code = dict_col.find_code(_value);                                       \
                 _dict_code_inited = true;                                                      \
             }                                                                                  \
         }                                                                                      \
@@ -530,6 +529,9 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=)
 COMPARISON_PRED_SET_DICT_CODE(EqualPredicate)
 COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate)
 
+// If 1 OP 0 returns true, it means the predicate is > or >=
+// If 1 OP 1 returns true, it means the predicate is >= or <=
+// by this way, avoid redundant code
 #define RAMGE_COMPARISON_PRED_SET_DICT_CODE(CLASS, OP)                                         \
     template <class T>                                                                         \
     void CLASS<T>::set_dict_code_if_necessary(vectorized::IColumn& column) {                   \
@@ -548,8 +550,7 @@ COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate)
                 auto& dict_col =                                                               \
                         reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(    \
                                 *col_ptr);                                                     \
-                auto code = dict_col.find_code_by_bound(_value, 0 OP 1, 1 OP 1);               \
-                _dict_code = code;                                                             \
+                _dict_code = dict_col.find_code_by_bound(_value, 1 OP 0, 1 OP 1);              \
                 _dict_code_inited = true;                                                      \
             }                                                                                  \
         }                                                                                      \
diff --git a/be/src/olap/in_list_predicate.cpp b/be/src/olap/in_list_predicate.cpp
index 21214f217a..0401673926 100644
--- a/be/src/olap/in_list_predicate.cpp
+++ b/be/src/olap/in_list_predicate.cpp
@@ -299,8 +299,7 @@ IN_LIST_PRED_BITMAP_EVALUATE(NotInListPredicate, -=)
                 auto& dict_col =                                                               \
                         reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(    \
                                 *col_ptr);                                                     \
-                auto code_set = dict_col.find_codes(_values);                                  \
-                _dict_codes = std::move(code_set);                                             \
+                _dict_codes = dict_col.find_codes(_values);                                    \
                 _dict_code_inited = true;                                                      \
             }                                                                                  \
         }                                                                                      \
diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h
index cca6c8594c..499a04abdf 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -244,8 +244,8 @@ public:
 
     int32_t find_code(const StringValue& value) const { return _dict.find_code(value); }
 
-    int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const {
-        return _dict.find_code_by_bound(value, lower, eq);
+    int32_t find_code_by_bound(const StringValue& value, bool greater, bool eq) const {
+        return _dict.find_code_by_bound(value, greater, eq);
     }
 
     phmap::flat_hash_set<int32_t> find_codes(
@@ -294,19 +294,31 @@ public:
             return -1;
         }
 
-        inline int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const {
+        // For > , code takes upper_bound - 1; For >= , code takes upper_bound
+        // For < , code takes upper_bound; For <=, code takes upper_bound - 1
+        // For example a sorted dict: <'b',0> <'c',1> <'d',2>
+        // Now the predicate value is 'ccc', 'ccc' is not in the dict, 'ccc' is between 'c' and 'd'.
+        // std::upper_bound(..., 'ccc') - begin, will return the encoding of 'd', which is 2
+        // If the predicate is col > 'ccc' and the value of upper_bound-1 is 1,
+        //  then evaluate code > 1 and the result is 'd'.
+        // If the predicate is col < 'ccc' and the value of upper_bound is 2,
+        //  evaluate code < 2, and the return result is 'b'.
+        // If the predicate is col >= 'ccc' and the value of upper_bound is 2,
+        //  evaluate code >= 2, and the return result is 'd'.
+        // If the predicate is col <= 'ccc' and the value of upper_bound-1 is 1,
+        //  evaluate code <= 1, and the returned result is 'b'.
+        // If the predicate is col < 'a', 'a' is also not in the dict, and 'a' is less than 'b',
+        //  so upper_bound is the code 0 of b, then evaluate code < 0 and returns empty
+        // If the predicate is col <= 'a' and upper_bound-1 is -1,
+        //  then evaluate code <= -1 and returns empty
+        inline int32_t find_code_by_bound(const StringValue& value, bool greater, bool eq) const {
             auto code = find_code(value);
             if (code >= 0) {
                 return code;
             }
-
-            if (lower) {
-                return std::lower_bound(_dict_data.begin(), _dict_data.end(), value) -
-                       _dict_data.begin() - eq;
-            } else {
-                return std::upper_bound(_dict_data.begin(), _dict_data.end(), value) -
-                       _dict_data.begin() + eq;
-            }
+            auto bound = std::upper_bound(_dict_data.begin(), _dict_data.end(), value) -
+                         _dict_data.begin();
+            return greater ? bound - greater + eq : bound - eq;
         }
 
         inline phmap::flat_hash_set<int32_t> find_codes(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org