You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/06/09 00:53:08 UTC

[doris] branch master updated: [Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 845d459f05 [Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410)
845d459f05 is described below

commit 845d459f055f792c828c83f9334a9e49174b3ee4
Author: Qi Chen <ka...@gmail.com>
AuthorDate: Fri Jun 9 08:53:01 2023 +0800

    [Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410)
    
    Fix some bugs in ORC lazy materialization (#18615):
    - Fix the block's column count continuously increasing after `execute_conjuncts()` by calling `Block::erase_useless_column()`.
    - Fix partition column issues in ORC lazy materialization.
    - Fix lazy materialization not being used when a predicate column name is inconsistent with the column name in the ORC file.
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         | 21 +++++++------
 be/src/vec/exec/format/orc/vorc_reader.h           |  1 +
 .../iceberg/iceberg_partition_upper_case.out       | 36 +++++++++++++++++++---
 .../iceberg/iceberg_partition_upper_case.groovy    | 26 +++++++++++++---
 4 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 0ba81cf986..7d92a9f5e1 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -637,11 +637,7 @@ Status OrcReader::set_fill_columns(
     std::unordered_map<std::string, std::pair<uint32_t, int>> predicate_columns;
     std::function<void(VExpr * expr)> visit_slot = [&](VExpr* expr) {
         if (VSlotRef* slot_ref = typeid_cast<VSlotRef*>(expr)) {
-            auto expr_name = slot_ref->expr_name();
-            auto iter = _col_name_to_file_col_name.find(expr_name);
-            if (iter != _col_name_to_file_col_name.end()) {
-                expr_name = iter->second;
-            }
+            auto& expr_name = slot_ref->expr_name();
             predicate_columns.emplace(expr_name,
                                       std::make_pair(slot_ref->column_id(), slot_ref->slot_id()));
             if (slot_ref->column_id() == 0) {
@@ -684,6 +680,8 @@ Status OrcReader::set_fill_columns(
             } else {
                 _lazy_read_ctx.predicate_columns.first.emplace_back(iter->first);
                 _lazy_read_ctx.predicate_columns.second.emplace_back(iter->second.second);
+                _lazy_read_ctx.predicate_orc_columns.emplace_back(
+                        _col_name_to_file_col_name[iter->first]);
                 _lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first);
             }
         }
@@ -714,6 +712,10 @@ Status OrcReader::set_fill_columns(
         _lazy_read_ctx.can_lazy_read = true;
     }
 
+    if (_colname_to_value_range == nullptr || !_init_search_argument(_colname_to_value_range)) {
+        _lazy_read_ctx.can_lazy_read = false;
+    }
+
     if (!_lazy_read_ctx.can_lazy_read) {
         for (auto& kv : _lazy_read_ctx.predicate_partition_columns) {
             _lazy_read_ctx.partition_columns.emplace(kv.first, kv.second);
@@ -728,12 +730,9 @@ Status OrcReader::set_fill_columns(
     // create orc row reader
     _row_reader_options.range(_range_start_offset, _range_size);
     _row_reader_options.setTimezoneName(_ctz);
-    if (!_init_search_argument(_colname_to_value_range)) {
-        _lazy_read_ctx.can_lazy_read = false;
-    }
     _row_reader_options.include(_read_cols);
     if (_lazy_read_ctx.can_lazy_read) {
-        _row_reader_options.filter(_lazy_read_ctx.predicate_columns.first);
+        _row_reader_options.filter(_lazy_read_ctx.predicate_orc_columns);
         _orc_filter = std::unique_ptr<ORCFilterImpl>(new ORCFilterImpl(this));
     }
     try {
@@ -1209,6 +1208,8 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
         }
         *read_rows = rr;
 
+        RETURN_IF_ERROR(_fill_partition_columns(block, rr, _lazy_read_ctx.partition_columns));
+        RETURN_IF_ERROR(_fill_missing_columns(block, rr, _lazy_read_ctx.missing_columns));
         RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal(block, columns_to_filter, *_filter));
         Block::erase_useless_column(block, column_to_keep);
     } else {
@@ -1271,6 +1272,7 @@ void OrcReader::_fill_batch_vec(std::vector<orc::ColumnVectorBatch*>& result,
 
 Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t size, void* arg) {
     Block* block = (Block*)arg;
+    size_t origin_column_num = block->columns();
 
     const auto& batch_vec = down_cast<orc::StructVectorBatch*>(&data)->fields;
     for (auto& col_name : _lazy_read_ctx.predicate_columns.first) {
@@ -1318,6 +1320,7 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s
         for (auto& col : _lazy_read_ctx.predicate_missing_columns) {
             block->get_by_name(col.first).column->assume_mutable()->clear();
         }
+        Block::erase_useless_column(block, origin_column_num);
     }
 
     uint16_t new_size = 0;
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h
index 7b5f808ce8..13a2d7265c 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -102,6 +102,7 @@ struct LazyReadContext {
     // be different with orc column name
     // std::pair<std::list<col_name>, std::vector<slot_id>>
     std::pair<std::list<std::string>, std::vector<int>> predicate_columns;
+    // predicate orc file column names
     std::list<std::string> predicate_orc_columns;
     std::vector<std::string> lazy_read_columns;
     std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>
diff --git a/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out b/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out
index 9da2a5b80f..376a9495b0 100644
--- a/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out
+++ b/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out
@@ -26,31 +26,45 @@ Shanghai
 -- !orcupper5 --
 2	k2_2	k3_2	Beijing
 
+-- !orcupper6 --
+1	k2_1	k3_1	Beijing
+
+-- !orcupper7 --
+1	k2_1	k3_1	Beijing
+2	k2_2	k3_2	Beijing
+
 -- !orclower1 --
 1	k2_1	k3_1	Beijing
 2	k2_2	k3_2	Beijing
 3	k2_3	k3_3	Shanghai
 4	k2_4	k3_4	Shanghai
 
--- !orclower1 --
+-- !orclower2 --
 1	Beijing
 2	Beijing
 3	Shanghai
 4	Shanghai
 
--- !orclower1 --
+-- !orclower3 --
 1	k2_1
 2	k2_2
 3	k2_3
 4	k2_4
 
--- !orclower1 --
+-- !orclower4 --
 Beijing
 Beijing
 Shanghai
 Shanghai
 
--- !orclower1 --
+-- !orclower5 --
+2	k2_2	k3_2	Beijing
+
+-- !orclower6 --
+1	k2_1	k3_1	Beijing
+
+-- !orclower7 --
+1	k2_1	k3_1	Beijing
 2	k2_2	k3_2	Beijing
 
 -- !parquetupper1 --
@@ -84,6 +98,13 @@ Shanghai
 3	k2_3	k3_3	Shanghai
 4	k2_4	k3_4	Shanghai
 
+-- !parquetupper7 --
+1	k2_1	k3_1	Beijing
+
+-- !parquetupper8 --
+1	k2_1	k3_1	Beijing
+2	k2_2	k3_2	Beijing
+
 -- !parquetlower1 --
 1	k2_1	k3_1	Beijing
 2	k2_2	k3_2	Beijing
@@ -115,3 +136,10 @@ Shanghai
 3	k2_3	k3_3	Shanghai
 4	k2_4	k3_4	Shanghai
 
+-- !parquetupper7 --
+1	k2_1	k3_1	Beijing
+
+-- !parquetupper8 --
+1	k2_1	k3_1	Beijing
+2	k2_2	k3_2	Beijing
+
diff --git a/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy b/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy
index efc2aaf77f..b4957495dc 100644
--- a/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy
+++ b/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy
@@ -21,12 +21,16 @@ suite("iceberg_partition_upper_case", "p2") {
     def orc_upper3 = """select k1, k2 from iceberg_partition_upper_case_orc order by k1;"""
     def orc_upper4 = """select city from iceberg_partition_upper_case_orc order by city;"""
     def orc_upper5 = """select * from iceberg_partition_upper_case_orc where k1>1 and city='Beijing' order by k1;"""
+    def orc_upper6 = """select * from iceberg_partition_upper_case_orc where k1=1 order by k1;"""
+    def orc_upper7 = """select * from iceberg_partition_upper_case_orc where k2 like '%k2%' and city like '%Bei%' order by k1;"""
 
     def orc_lower1 = """select * from iceberg_partition_lower_case_orc order by k1;"""
     def orc_lower2 = """select k1, city from iceberg_partition_lower_case_orc order by k1;"""
     def orc_lower3 = """select k1, k2 from iceberg_partition_lower_case_orc order by k1;"""
     def orc_lower4 = """select city from iceberg_partition_lower_case_orc order by city;"""
     def orc_lower5 = """select * from iceberg_partition_lower_case_orc where k1>1 and city='Beijing' order by k1;"""
+    def orc_lower6 = """select * from iceberg_partition_lower_case_orc where k1=1 order by k1;"""
+    def orc_lower7 = """select * from iceberg_partition_lower_case_orc where k2 like '%k2%' and city like '%Bei%' order by k1;"""
 
     def parquet_upper1 = """select * from iceberg_partition_upper_case_parquet order by k1;"""
     def parquet_upper2 = """select k1, city from iceberg_partition_upper_case_parquet order by k1;"""
@@ -34,6 +38,8 @@ suite("iceberg_partition_upper_case", "p2") {
     def parquet_upper4 = """select city from iceberg_partition_upper_case_parquet order by city;"""
     def parquet_upper5 = """select * from iceberg_partition_upper_case_parquet where k1>1 and city='Beijing' order by k1;"""
     def parquet_upper6 = """select * from iceberg_partition_upper_case_parquet where substring(city, 6)='hai' order by k1;"""
+    def parquet_upper7 = """select * from iceberg_partition_upper_case_parquet where k1=1 order by k1;"""
+    def parquet_upper8 = """select * from iceberg_partition_upper_case_parquet where k2 like '%k2%' and city like '%Bei%' order by k1;"""
 
     def parquet_lower1 = """select * from iceberg_partition_lower_case_parquet order by k1;"""
     def parquet_lower2 = """select k1, city from iceberg_partition_lower_case_parquet order by k1;"""
@@ -41,6 +47,8 @@ suite("iceberg_partition_upper_case", "p2") {
     def parquet_lower4 = """select city from iceberg_partition_lower_case_parquet order by city;"""
     def parquet_lower5 = """select * from iceberg_partition_lower_case_parquet where k1>1 and city='Beijing' order by k1;"""
     def parquet_lower6 = """select * from iceberg_partition_lower_case_parquet where substring(city, 6)='hai' order by k1;"""
+    def parquet_lower7 = """select * from iceberg_partition_lower_case_parquet where k1=1 order by k1;"""
+    def parquet_lower8 = """select * from iceberg_partition_lower_case_parquet where k2 like '%k2%' and city like '%Bei%' order by k1;"""
 
     String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
     if (enabled != null && enabled.equalsIgnoreCase("true")) {
@@ -63,23 +71,33 @@ suite("iceberg_partition_upper_case", "p2") {
         qt_orcupper3 orc_upper3
         qt_orcupper4 orc_upper4
         qt_orcupper5 orc_upper5
+        qt_orcupper6 orc_upper6
+        qt_orcupper7 orc_upper7
+
         qt_orclower1 orc_lower1
-        qt_orclower1 orc_lower2
-        qt_orclower1 orc_lower3
-        qt_orclower1 orc_lower4
-        qt_orclower1 orc_lower5
+        qt_orclower2 orc_lower2
+        qt_orclower3 orc_lower3
+        qt_orclower4 orc_lower4
+        qt_orclower5 orc_lower5
+        qt_orclower6 orc_lower6
+        qt_orclower7 orc_lower7
         qt_parquetupper1 parquet_upper1
         qt_parquetupper2 parquet_upper2
         qt_parquetupper3 parquet_upper3
         qt_parquetupper4 parquet_upper4
         qt_parquetupper5 parquet_upper5
         qt_parquetupper6 parquet_upper6
+        qt_parquetupper7 parquet_upper7
+        qt_parquetupper8 parquet_upper8
         qt_parquetlower1 parquet_lower1
         qt_parquetlower2 parquet_lower2
         qt_parquetlower3 parquet_lower3
         qt_parquetlower4 parquet_lower4
         qt_parquetlower5 parquet_lower5
         qt_parquetlower6 parquet_lower6
+        qt_parquetupper7 parquet_upper7
+        qt_parquetupper8 parquet_upper8
     }
 }
 
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org