You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/04/30 11:44:26 UTC

[doris] 15/15: [bugfix](low cardinality) cached code is wrong will result wrong query result when many null pages (#19221)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 8d628eaf63a9d0f3ccd1af5828246f7401abd7f5
Author: yiguolei <67...@qq.com>
AuthorDate: Sat Apr 29 21:28:41 2023 +0800

    [bugfix](low cardinality) cached code is wrong will result wrong query result when many null pages (#19221)
    
    Sometimes the dict is not initialized when run comparison predicate here, for example, the full page is null, then the reader will skip read, so that the dictionary is not inited. The cached code is wrong during this case, because the following page maybe not null, and the dict should have items in the future.
    This will result the dict string column query return wrong result, if there are many null values in the column.
    I also add some regression test for dict column's equal query, larger than query, less than query.
    
    ---------
    
    Co-authored-by: yiguolei <yi...@gmail.com>
---
 be/src/common/config.h                             |   1 +
 be/src/olap/comparison_predicate.h                 |  20 +++-
 be/src/vec/columns/column_dictionary.h             |   4 +-
 .../data/nereids_p0/test_dict_with_null.out        | 112 +++++++++++++++++++++
 .../data/query_p0/test_dict_with_null.out          | 112 +++++++++++++++++++++
 .../suites/query_p0/test_dict_with_null.groovy     |   5 +-
 6 files changed, 251 insertions(+), 3 deletions(-)

diff --git a/be/src/common/config.h b/be/src/common/config.h
index 91e77f0a9c..eb663db838 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -257,6 +257,7 @@ CONF_Bool(disable_storage_page_cache, "true");
 CONF_Bool(enable_storage_vectorization, "true");
 
 CONF_Bool(enable_low_cardinality_optimize, "true");
+CONF_Bool(enable_low_cardinality_cache_code, "true");
 
 // be policy
 // whether check compaction checksum
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
index 6898eeb1c1..4e1327811a 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -632,8 +632,26 @@ private:
             const vectorized::ColumnDictI32& column) const {
         /// if _cache_code_enabled is false, always find the code from dict.
         if (UNLIKELY(!_cache_code_enabled || _cached_code == _InvalidateCodeValue)) {
-            _cached_code = _is_range() ? column.find_code_by_bound(_value, _is_greater(), _is_eq())
+            int32_t code = _is_range() ? column.find_code_by_bound(_value, _is_greater(), _is_eq())
                                        : column.find_code(_value);
+
+            // Protect the invalid code logic, to avoid data error.
+            if (code == _InvalidateCodeValue) {
+                LOG(FATAL) << "column dictionary should not return the code " << code
+                           << ", because it is assumed as an invalid code in comparison predicate";
+            }
+            // Sometimes the dict is not initialized when run comparison predicate here, for example,
+            // the full page is null, then the reader will skip read, so that the dictionary is not
+            // inited. The cached code is wrong during this case, because the following page maybe not
+            // null, and the dict should have items in the future.
+            //
+            // Cached code may have problems, so that add a config here, if not opened, then
+            // we will return the code and not cache it.
+            if (column.is_dict_empty() || !config::enable_low_cardinality_cache_code) {
+                return code;
+            }
+            // If the dict is not empty, then the dict is inited and we could cache the value.
+            _cached_code = code;
         }
         return _cached_code;
     }
diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h
index ca79748ce2..119e446426 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -286,6 +286,8 @@ public:
 
     bool is_dict_sorted() const { return _dict_sorted; }
 
+    bool is_dict_empty() const { return _dict.empty(); }
+
     bool is_dict_code_converted() const { return _dict_code_converted; }
 
     MutableColumnPtr convert_to_predicate_column_if_dictionary() override {
@@ -461,7 +463,7 @@ public:
 
         size_t byte_size() { return _dict_data->size() * sizeof((*_dict_data)[0]); }
 
-        bool empty() { return _dict_data->empty(); }
+        bool empty() const { return _dict_data->empty(); }
 
         size_t avg_str_len() { return empty() ? 0 : _total_str_len / _dict_data->size(); }
 
diff --git a/regression-test/data/nereids_p0/test_dict_with_null.out b/regression-test/data/nereids_p0/test_dict_with_null.out
new file mode 100644
index 0000000000..7ce45abf43
--- /dev/null
+++ b/regression-test/data/nereids_p0/test_dict_with_null.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+101	abc
+
+-- !sql2 --
+101	abc
+
+-- !sql3 --
+101	abc
+
+-- !sql4 --
+100	\N
+99	\N
+98	\N
+97	\N
+96	\N
+95	\N
+94	\N
+93	\N
+92	\N
+91	\N
+90	\N
+89	\N
+88	\N
+87	\N
+86	\N
+85	\N
+84	\N
+83	\N
+82	\N
+81	\N
+80	\N
+79	\N
+78	\N
+77	\N
+76	\N
+75	\N
+74	\N
+73	\N
+72	\N
+71	\N
+70	\N
+69	\N
+68	\N
+67	\N
+66	\N
+65	\N
+64	\N
+63	\N
+62	\N
+61	\N
+60	\N
+59	\N
+58	\N
+57	\N
+56	\N
+55	\N
+54	\N
+53	\N
+52	\N
+51	\N
+50	\N
+49	\N
+48	\N
+47	\N
+46	\N
+45	\N
+44	\N
+43	\N
+42	\N
+41	\N
+40	\N
+39	\N
+38	\N
+37	\N
+36	\N
+35	\N
+34	\N
+33	\N
+32	\N
+31	\N
+30	\N
+29	\N
+28	\N
+27	\N
+26	\N
+25	\N
+24	\N
+23	\N
+22	\N
+21	\N
+20	\N
+19	\N
+18	\N
+17	\N
+16	\N
+15	\N
+14	\N
+13	\N
+12	\N
+11	\N
+10	\N
+9	\N
+8	\N
+7	\N
+6	\N
+5	\N
+4	\N
+3	\N
+2	\N
+1	\N
+
diff --git a/regression-test/data/query_p0/test_dict_with_null.out b/regression-test/data/query_p0/test_dict_with_null.out
new file mode 100644
index 0000000000..7ce45abf43
--- /dev/null
+++ b/regression-test/data/query_p0/test_dict_with_null.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+101	abc
+
+-- !sql2 --
+101	abc
+
+-- !sql3 --
+101	abc
+
+-- !sql4 --
+100	\N
+99	\N
+98	\N
+97	\N
+96	\N
+95	\N
+94	\N
+93	\N
+92	\N
+91	\N
+90	\N
+89	\N
+88	\N
+87	\N
+86	\N
+85	\N
+84	\N
+83	\N
+82	\N
+81	\N
+80	\N
+79	\N
+78	\N
+77	\N
+76	\N
+75	\N
+74	\N
+73	\N
+72	\N
+71	\N
+70	\N
+69	\N
+68	\N
+67	\N
+66	\N
+65	\N
+64	\N
+63	\N
+62	\N
+61	\N
+60	\N
+59	\N
+58	\N
+57	\N
+56	\N
+55	\N
+54	\N
+53	\N
+52	\N
+51	\N
+50	\N
+49	\N
+48	\N
+47	\N
+46	\N
+45	\N
+44	\N
+43	\N
+42	\N
+41	\N
+40	\N
+39	\N
+38	\N
+37	\N
+36	\N
+35	\N
+34	\N
+33	\N
+32	\N
+31	\N
+30	\N
+29	\N
+28	\N
+27	\N
+26	\N
+25	\N
+24	\N
+23	\N
+22	\N
+21	\N
+20	\N
+19	\N
+18	\N
+17	\N
+16	\N
+15	\N
+14	\N
+13	\N
+12	\N
+11	\N
+10	\N
+9	\N
+8	\N
+7	\N
+6	\N
+5	\N
+4	\N
+3	\N
+2	\N
+1	\N
+
diff --git a/regression-test/suites/query_p0/test_dict_with_null.groovy b/regression-test/suites/query_p0/test_dict_with_null.groovy
index a5c84444ae..b3738bb68a 100644
--- a/regression-test/suites/query_p0/test_dict_with_null.groovy
+++ b/regression-test/suites/query_p0/test_dict_with_null.groovy
@@ -42,5 +42,8 @@ suite("dict_with_null", "query") {
     insert_sql += ", (101, 'abc')"
 
     sql insert_sql
-    sql "select * from test_dict_with_null where c_string > '0'"
+    qt_sql1 "select * from test_dict_with_null where c_string > '0'"
+    qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'"
+    qt_sql3 "select * from test_dict_with_null where c_string = 'abc'"
+    qt_sql4 "select * from test_dict_with_null where c_string is null order by c_int desc"
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org