You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/06/21 06:54:09 UTC
[doris] branch master updated: [Fix](orc-reader) Fix orc dict filter null value issue in `_convert_dict_cols_to_string_cols` which caused incorrect result. (#21047)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new bad22dd4e2 [Fix](orc-reader) Fix orc dict filter null value issue in `_convert_dict_cols_to_string_cols` which caused incorrect result. (#21047)
bad22dd4e2 is described below

commit bad22dd4e25a8e382d6ada5b161910aabce88bd5
Author: Qi Chen <ka...@gmail.com>
AuthorDate: Wed Jun 21 14:54:01 2023 +0800

    [Fix](orc-reader) Fix orc dict filter null value issue in `_convert_dict_cols_to_string_cols` which caused incorrect result. (#21047)
    
    Query results should not have empty values.
    ```
    use regresssion.multi_catalog;
    select commit_id from github_events_orc WHERE (event_type = 'CommitCommentEvent') AND commit_id != "" limit 10;
    ```
    ```
    +------------------------------------------+
    | commit_id                                |
    +------------------------------------------+
    | 685c1fd8dbbdc10c042932f9a9f88be00ff96c75 |
    | 685c1fd8dbbdc10c042932f9a9f88be00ff96c75 |
    | 4e3ab2ff2d2474f5d51334b9b0fdf17e9845a166 |
    |                                          |
    |                                          |
    |                                          |
    |                                          |
    |                                          |
    |                                          |
    | 7191c20cb49da07a7fc16aa32dc0de4faff528b2 |
    +------------------------------------------+
    10 rows in set (0.54 sec)
    ```
---
 be/src/vec/columns/column_string.h                 |  2 +
 be/src/vec/exec/format/orc/vorc_reader.cpp         | 47 ++++++++------
 be/src/vec/exec/format/orc/vorc_reader.h           |  1 +
 .../format/parquet/byte_array_dict_decoder.cpp     |  4 +-
 .../hive/test_external_github.out                  | 75 ++++++++++++++++++++++
 .../hive/test_external_github.groovy               |  9 +++
 6 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h
index eb27cdc4ed..efd90fd844 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -64,6 +64,8 @@ public:
     using Char = UInt8;
     using Chars = PaddedPODArray<UInt8>;
 
+    static constexpr size_t MAX_STRINGS_OVERFLOW_SIZE = 128;
+
     void static check_chars_length(size_t total_length, size_t element_number) {
         if (UNLIKELY(total_length > MAX_STRING_SIZE)) {
             throw Exception(ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE,
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 007aff91ca..8b1ccd8ccc 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -94,6 +94,7 @@ namespace doris::vectorized {
 
 // TODO: we need to determine it by test.
 static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits<uint32_t>::max();
+static constexpr char EMPTY_STRING_FOR_OVERFLOW[ColumnString::MAX_STRINGS_OVERFLOW_SIZE] = "";
 
 #define FOR_FLAT_ORC_COLUMNS(M)                            \
     M(TypeIndex::Int8, Int8, orc::LongVectorBatch)         \
@@ -1039,7 +1040,6 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                                                      const orc::TypeKind& type_kind,
                                                      orc::EncodedStringVectorBatch* cvb,
                                                      size_t num_values) {
-    const static std::string empty_string;
     std::vector<StringRef> string_values;
     size_t max_value_length = 0;
     string_values.reserve(num_values);
@@ -1054,7 +1054,7 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                 if (cvb->notNull[i]) {
                     if constexpr (is_filter) {
                         if (!filter_data[i]) {
-                            string_values.emplace_back(empty_string.data(), 0);
+                            string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                             continue;
                         }
                     }
@@ -1070,14 +1070,14 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                     // Orc doesn't fill null values in new batch, but the former batch has been release.
                     // Other types like int/long/timestamp... are flat types without pointer in them,
                     // so other types do not need to be handled separately like string.
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
             for (int i = 0; i < num_values; ++i) {
                 if constexpr (is_filter) {
                     if (!filter_data[i]) {
-                        string_values.emplace_back(empty_string.data(), 0);
+                        string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                         continue;
                     }
                 }
@@ -1097,23 +1097,26 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                 if (cvb->notNull[i]) {
                     if constexpr (is_filter) {
                         if (!filter_data[i]) {
-                            string_values.emplace_back(empty_string.data(), 0);
+                            string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                             continue;
                         }
                     }
                     char* val_ptr;
                     int64_t length;
                     cvb->dictionary->getValueByIndex(cvb->index.data()[i], val_ptr, length);
+                    if (length > max_value_length) {
+                        max_value_length = length;
+                    }
                     string_values.emplace_back(val_ptr, length);
                 } else {
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
             for (int i = 0; i < num_values; ++i) {
                 if constexpr (is_filter) {
                     if (!filter_data[i]) {
-                        string_values.emplace_back(empty_string.data(), 0);
+                        string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                         continue;
                     }
                 }
@@ -1127,7 +1130,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
             }
         }
     }
-    data_column->insert_many_strings(&string_values[0], string_values.size());
+    data_column->insert_many_strings_overflow(&string_values[0], string_values.size(),
+                                              max_value_length);
     return Status::OK();
 }
 
@@ -1938,11 +1942,12 @@ Status OrcReader::_convert_dict_cols_to_string_cols(
             const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr();
             const ColumnInt32* dict_column = assert_cast<const ColumnInt32*>(nested_column.get());
             DCHECK(dict_column);
+            const NullMap& null_map = nullable_column->get_null_map_data();
 
             MutableColumnPtr string_column;
             if (batch_vec != nullptr) {
                 string_column = _convert_dict_column_to_string_column(
-                        dict_column, (*batch_vec)[orc_col_idx->second],
+                        dict_column, &null_map, (*batch_vec)[orc_col_idx->second],
                         _col_orc_type[orc_col_idx->second]);
             } else {
                 string_column = ColumnString::create();
@@ -1958,7 +1963,7 @@ Status OrcReader::_convert_dict_cols_to_string_cols(
             MutableColumnPtr string_column;
             if (batch_vec != nullptr) {
                 string_column = _convert_dict_column_to_string_column(
-                        dict_column, (*batch_vec)[orc_col_idx->second],
+                        dict_column, nullptr, (*batch_vec)[orc_col_idx->second],
                         _col_orc_type[orc_col_idx->second]);
             } else {
                 string_column = ColumnString::create();
@@ -1971,12 +1976,15 @@ Status OrcReader::_convert_dict_cols_to_string_cols(
     return Status::OK();
 }
 
+// TODO: Possible optimization points.
+//  After filtering the dict column, the null_map for the null dict column should always not be null.
+//  Then it can avoid checking null_map. However, currently when inert materialization is enabled,
+//  the filter column will not be filtered first, but will be filtered together at the end.
 MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
-        const ColumnInt32* dict_column, orc::ColumnVectorBatch* cvb,
+        const ColumnInt32* dict_column, const NullMap* null_map, orc::ColumnVectorBatch* cvb,
         const orc::Type* orc_column_type) {
     SCOPED_RAW_TIMER(&_statistics.decode_value_time);
     auto res = ColumnString::create();
-    const static std::string empty_string;
     auto* encoded_string_vector_batch = static_cast<orc::EncodedStringVectorBatch*>(cvb);
     DCHECK(encoded_string_vector_batch);
     std::vector<StringRef> string_values;
@@ -1984,11 +1992,12 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
     const int* dict_data = dict_column->get_data().data();
     string_values.reserve(num_values);
     size_t max_value_length = 0;
+    auto* null_map_data = null_map->data();
     if (orc_column_type->getKind() == orc::TypeKind::CHAR) {
         // Possibly there are some zero padding characters in CHAR type, we have to strip them off.
-        if (cvb->hasNulls) {
+        if (null_map) {
             for (int i = 0; i < num_values; ++i) {
-                if (cvb->notNull[i]) {
+                if (!null_map_data[i]) {
                     char* val_ptr;
                     int64_t length;
                     encoded_string_vector_batch->dictionary->getValueByIndex(dict_data[i], val_ptr,
@@ -2002,7 +2011,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
                     // Orc doesn't fill null values in new batch, but the former batch has been release.
                     // Other types like int/long/timestamp... are flat types without pointer in them,
                     // so other types do not need to be handled separately like string.
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
@@ -2019,9 +2028,9 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
             }
         }
     } else {
-        if (cvb->hasNulls) {
+        if (null_map) {
             for (int i = 0; i < num_values; ++i) {
-                if (cvb->notNull[i]) {
+                if (!null_map_data[i]) {
                     char* val_ptr;
                     int64_t length;
                     encoded_string_vector_batch->dictionary->getValueByIndex(dict_data[i], val_ptr,
@@ -2031,7 +2040,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
                     }
                     string_values.emplace_back(val_ptr, length);
                 } else {
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
@@ -2047,7 +2056,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
             }
         }
     }
-    res->insert_many_strings(&string_values[0], num_values);
+    res->insert_many_strings_overflow(&string_values[0], num_values, max_value_length);
     return res;
 }
 
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h
index de7a8d182f..3068d55681 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -475,6 +475,7 @@ private:
                                              const std::vector<orc::ColumnVectorBatch*>* batch_vec);
 
     MutableColumnPtr _convert_dict_column_to_string_column(const ColumnInt32* dict_column,
+                                                           const NullMap* null_map,
                                                            orc::ColumnVectorBatch* cvb,
                                                            const orc::Type* orc_column_typ);
 
diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
index 76bd89929a..1e09890a98 100644
--- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
@@ -29,8 +29,6 @@
 
 namespace doris::vectorized {
 
-constexpr int MAX_STRINGS_OVERFLOW_SIZE = 128;
-
 Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t length,
                                       size_t num_values) {
     _dict = std::move(dict);
@@ -48,7 +46,7 @@ Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t
 
     _dict_value_to_code.reserve(num_values);
     // For insert_many_strings_overflow
-    _dict_data.resize(total_length + MAX_STRINGS_OVERFLOW_SIZE);
+    _dict_data.resize(total_length + ColumnString::MAX_STRINGS_OVERFLOW_SIZE);
     _max_value_length = 0;
     size_t offset = 0;
     offset_cursor = 0;
diff --git a/regression-test/data/external_table_emr_p2/hive/test_external_github.out b/regression-test/data/external_table_emr_p2/hive/test_external_github.out
index fa53b491ff..54f2f21081 100644
--- a/regression-test/data/external_table_emr_p2/hive/test_external_github.out
+++ b/regression-test/data/external_table_emr_p2/hive/test_external_github.out
@@ -2401,6 +2401,44 @@ centaurean/density	988
 google/boringssl	976
 woboq/woboq_codebrowser	916
 
+-- !60 --
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000001f2ffbd4641882b1b460e0ec21abc42
+000000076395383d824ec8a468a88ee2baa6f121
+000000076395383d824ec8a468a88ee2baa6f121
+0000000bab11354f9a759332065be5f066c3398f
+
+-- !61 --
+fffffe630c2e9b0404d9f3838315e42284d033d6
+fffffa26fa725dbc5e01dd52f18eb63ae5575b8d
+fffff77486c5866ceef27c274849de55ec49e9c0
+fffff600c5b5272a9c40c2f11beb512429b57eed
+fffff427a6fe2f8d73c2ef137570ef93544a8ef5
+fffff23093876194fab64a3fdc182e2c63b49644
+fffff06aa80c33f247249df38a2b3cacecd51653
+ffffe8bc3f37233ebb90f51b68d96cad89d132e6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+
+-- !62 --
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !63 --
+
 -- !01 --
 Homebrew/homebrew-core	33	15
 pocoproject/poco	28	14
@@ -4803,3 +4841,40 @@ centaurean/density	988
 google/boringssl	976
 woboq/woboq_codebrowser	916
 
+-- !60 --
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000001f2ffbd4641882b1b460e0ec21abc42
+000000076395383d824ec8a468a88ee2baa6f121
+000000076395383d824ec8a468a88ee2baa6f121
+0000000bab11354f9a759332065be5f066c3398f
+
+-- !61 --
+fffffe630c2e9b0404d9f3838315e42284d033d6
+fffffa26fa725dbc5e01dd52f18eb63ae5575b8d
+fffff77486c5866ceef27c274849de55ec49e9c0
+fffff600c5b5272a9c40c2f11beb512429b57eed
+fffff427a6fe2f8d73c2ef137570ef93544a8ef5
+fffff23093876194fab64a3fdc182e2c63b49644
+fffff06aa80c33f247249df38a2b3cacecd51653
+ffffe8bc3f37233ebb90f51b68d96cad89d132e6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+
+-- !62 --
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !63 --
diff --git a/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy b/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy
index 4d3e636836..362f1e5f61 100644
--- a/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy
+++ b/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy
@@ -495,6 +495,11 @@ suite("test_external_github", "p2") {
         ORDER BY stars DESC
         LIMIT 50"""
 
+    def detail_test1 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id != "" order by commit_id asc limit 10"""
+    def detail_test2 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id != "" order by commit_id desc limit 10"""
+    def detail_test3 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id is null order by commit_id limit 10"""
+    def detail_test4 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id is null AND commit_id != "" order by commit_id limit 10"""
+
     String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
     if (enabled != null && enabled.equalsIgnoreCase("true")) {
         String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
@@ -580,6 +585,10 @@ suite("test_external_github", "p2") {
             qt_57 whoAreAllThosePeopleGivingStars1.replace("SUFFIX", format)
             qt_58 whoAreAllThosePeopleGivingStars2.replace("SUFFIX", format)
             qt_59 whoAreAllThosePeopleGivingStars3.replace("SUFFIX", format)
+            qt_60 detail_test1.replace("SUFFIX", format)
+            qt_61 detail_test2.replace("SUFFIX", format)
+            qt_62 detail_test3.replace("SUFFIX", format)
+            qt_63 detail_test4.replace("SUFFIX", format)
         }
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org