You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/04/30 15:44:48 UTC

[doris] 01/03: [bugfix](low cardinality) cached code is wrong will result wrong query result when many null pages (#19221)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.0-alpha
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 0a06f7372fe0e1ea9281b6a1e95570f7f6c74338
Author: yiguolei <67...@qq.com>
AuthorDate: Sat Apr 29 21:28:41 2023 +0800

    [bugfix](low cardinality) cached code is wrong will result wrong query result when many null pages (#19221)
    
    When a comparison predicate is evaluated, the column dictionary may not be initialized yet: for example, if an entire page is null, the reader skips reading it, so the dictionary is never populated. A code cached in this state is wrong, because later pages may contain non-null values and the dictionary will then have entries.
    As a result, queries on dictionary-encoded string columns could return wrong results when the column contains many null values.
    This commit also adds regression tests for equality, greater-than, and less-than queries on a dict column.
    
    ---------
    
    Co-authored-by: yiguolei <yi...@gmail.com>
---
 be/src/common/config.h                             |   1 +
 be/src/olap/comparison_predicate.h                 |  20 +++-
 be/src/vec/columns/column_dictionary.h             |   4 +-
 .../data/nereids_p0/test_dict_with_null.out        | 112 +++++++++++++++++++++
 .../data/query_p0/test_dict_with_null.out          | 112 +++++++++++++++++++++
 .../suites/nereids_p0/test_dict_with_null.groovy   |   5 +-
 .../suites/query_p0/test_dict_with_null.groovy     |   5 +-
 7 files changed, 255 insertions(+), 4 deletions(-)

diff --git a/be/src/common/config.h b/be/src/common/config.h
index 65ce764be1..62651e9e51 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -284,6 +284,7 @@ CONF_Bool(disable_storage_page_cache, "false");
 CONF_Bool(disable_storage_row_cache, "true");
 
 CONF_Bool(enable_low_cardinality_optimize, "true");
+CONF_Bool(enable_low_cardinality_cache_code, "true");
 
 // be policy
 // whether check compaction checksum
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
index 0371b27282..1a7fa8baa7 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -564,8 +564,26 @@ private:
             const vectorized::ColumnDictI32& column) const {
         /// if _cache_code_enabled is false, always find the code from dict.
         if (UNLIKELY(!_cache_code_enabled || _cached_code == _InvalidateCodeValue)) {
-            _cached_code = _is_range() ? column.find_code_by_bound(_value, _is_greater(), _is_eq())
+            int32_t code = _is_range() ? column.find_code_by_bound(_value, _is_greater(), _is_eq())
                                        : column.find_code(_value);
+
+            // Protect the invalid code logic, to avoid data error.
+            if (code == _InvalidateCodeValue) {
+                LOG(FATAL) << "column dictionary should not return the code " << code
+                           << ", because it is assumed as an invalid code in comparison predicate";
+            }
+            // Sometimes the dict is not initialized when run comparison predicate here, for example,
+            // the full page is null, then the reader will skip read, so that the dictionary is not
+            // inited. The cached code is wrong during this case, because the following page maybe not
+            // null, and the dict should have items in the future.
+            //
+            // Cached code may have problems, so that add a config here, if not opened, then
+            // we will return the code and not cache it.
+            if (column.is_dict_empty() || !config::enable_low_cardinality_cache_code) {
+                return code;
+            }
+            // If the dict is not empty, then the dict is inited and we could cache the value.
+            _cached_code = code;
         }
         return _cached_code;
     }
diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h
index 8a2c055d13..0d3b4ee8de 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -306,6 +306,8 @@ public:
 
     bool is_dict_sorted() const { return _dict_sorted; }
 
+    bool is_dict_empty() const { return _dict.empty(); }
+
     bool is_dict_code_converted() const { return _dict_code_converted; }
 
     MutableColumnPtr convert_to_predicate_column_if_dictionary() override {
@@ -507,7 +509,7 @@ public:
 
         size_t byte_size() { return _dict_data->size() * sizeof((*_dict_data)[0]); }
 
-        bool empty() { return _dict_data->empty(); }
+        bool empty() const { return _dict_data->empty(); }
 
         size_t avg_str_len() { return empty() ? 0 : _total_str_len / _dict_data->size(); }
 
diff --git a/regression-test/data/nereids_p0/test_dict_with_null.out b/regression-test/data/nereids_p0/test_dict_with_null.out
new file mode 100644
index 0000000000..7ce45abf43
--- /dev/null
+++ b/regression-test/data/nereids_p0/test_dict_with_null.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+101	abc
+
+-- !sql2 --
+101	abc
+
+-- !sql3 --
+101	abc
+
+-- !sql4 --
+100	\N
+99	\N
+98	\N
+97	\N
+96	\N
+95	\N
+94	\N
+93	\N
+92	\N
+91	\N
+90	\N
+89	\N
+88	\N
+87	\N
+86	\N
+85	\N
+84	\N
+83	\N
+82	\N
+81	\N
+80	\N
+79	\N
+78	\N
+77	\N
+76	\N
+75	\N
+74	\N
+73	\N
+72	\N
+71	\N
+70	\N
+69	\N
+68	\N
+67	\N
+66	\N
+65	\N
+64	\N
+63	\N
+62	\N
+61	\N
+60	\N
+59	\N
+58	\N
+57	\N
+56	\N
+55	\N
+54	\N
+53	\N
+52	\N
+51	\N
+50	\N
+49	\N
+48	\N
+47	\N
+46	\N
+45	\N
+44	\N
+43	\N
+42	\N
+41	\N
+40	\N
+39	\N
+38	\N
+37	\N
+36	\N
+35	\N
+34	\N
+33	\N
+32	\N
+31	\N
+30	\N
+29	\N
+28	\N
+27	\N
+26	\N
+25	\N
+24	\N
+23	\N
+22	\N
+21	\N
+20	\N
+19	\N
+18	\N
+17	\N
+16	\N
+15	\N
+14	\N
+13	\N
+12	\N
+11	\N
+10	\N
+9	\N
+8	\N
+7	\N
+6	\N
+5	\N
+4	\N
+3	\N
+2	\N
+1	\N
+
diff --git a/regression-test/data/query_p0/test_dict_with_null.out b/regression-test/data/query_p0/test_dict_with_null.out
new file mode 100644
index 0000000000..7ce45abf43
--- /dev/null
+++ b/regression-test/data/query_p0/test_dict_with_null.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+101	abc
+
+-- !sql2 --
+101	abc
+
+-- !sql3 --
+101	abc
+
+-- !sql4 --
+100	\N
+99	\N
+98	\N
+97	\N
+96	\N
+95	\N
+94	\N
+93	\N
+92	\N
+91	\N
+90	\N
+89	\N
+88	\N
+87	\N
+86	\N
+85	\N
+84	\N
+83	\N
+82	\N
+81	\N
+80	\N
+79	\N
+78	\N
+77	\N
+76	\N
+75	\N
+74	\N
+73	\N
+72	\N
+71	\N
+70	\N
+69	\N
+68	\N
+67	\N
+66	\N
+65	\N
+64	\N
+63	\N
+62	\N
+61	\N
+60	\N
+59	\N
+58	\N
+57	\N
+56	\N
+55	\N
+54	\N
+53	\N
+52	\N
+51	\N
+50	\N
+49	\N
+48	\N
+47	\N
+46	\N
+45	\N
+44	\N
+43	\N
+42	\N
+41	\N
+40	\N
+39	\N
+38	\N
+37	\N
+36	\N
+35	\N
+34	\N
+33	\N
+32	\N
+31	\N
+30	\N
+29	\N
+28	\N
+27	\N
+26	\N
+25	\N
+24	\N
+23	\N
+22	\N
+21	\N
+20	\N
+19	\N
+18	\N
+17	\N
+16	\N
+15	\N
+14	\N
+13	\N
+12	\N
+11	\N
+10	\N
+9	\N
+8	\N
+7	\N
+6	\N
+5	\N
+4	\N
+3	\N
+2	\N
+1	\N
+
diff --git a/regression-test/suites/nereids_p0/test_dict_with_null.groovy b/regression-test/suites/nereids_p0/test_dict_with_null.groovy
index becceafdde..a0858c2122 100644
--- a/regression-test/suites/nereids_p0/test_dict_with_null.groovy
+++ b/regression-test/suites/nereids_p0/test_dict_with_null.groovy
@@ -44,5 +44,8 @@ suite("dict_with_null", "query") {
     insert_sql += ", (101, 'abc')"
 
     sql insert_sql
-    sql "select * from test_dict_with_null where c_string > '0'"
+    qt_sql1 "select * from test_dict_with_null where c_string > '0'"
+    qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'"
+    qt_sql3 "select * from test_dict_with_null where c_string = 'abc'"
+    qt_sql4 "select * from test_dict_with_null where c_string is null order by c_int desc"
 }
\ No newline at end of file
diff --git a/regression-test/suites/query_p0/test_dict_with_null.groovy b/regression-test/suites/query_p0/test_dict_with_null.groovy
index a5c84444ae..b3738bb68a 100644
--- a/regression-test/suites/query_p0/test_dict_with_null.groovy
+++ b/regression-test/suites/query_p0/test_dict_with_null.groovy
@@ -42,5 +42,8 @@ suite("dict_with_null", "query") {
     insert_sql += ", (101, 'abc')"
 
     sql insert_sql
-    sql "select * from test_dict_with_null where c_string > '0'"
+    qt_sql1 "select * from test_dict_with_null where c_string > '0'"
+    qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'"
+    qt_sql3 "select * from test_dict_with_null where c_string = 'abc'"
+    qt_sql4 "select * from test_dict_with_null where c_string is null order by c_int desc"
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org