You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/06/09 04:10:49 UTC
[doris] 13/29: [Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410)
This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0-beta
in repository https://gitbox.apache.org/repos/asf/doris.git
commit 0014794fb4a7d33b5d8c67d70270109f1d959204
Author: Qi Chen <ka...@gmail.com>
AuthorDate: Fri Jun 9 08:53:01 2023 +0800
[Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410)
Fix some bugs of ORC lazy materialization (#18615):
- Fix an issue where the number of columns in the block kept increasing after `execute_conjuncts()`, by calling `Block::erase_useless_column()`.
- Fix partition issues of orc lazy materialization.
- Fix lazy materialization not being used when the predicate column name differs from the column name in the ORC file.
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 21 +++++++------
be/src/vec/exec/format/orc/vorc_reader.h | 1 +
.../iceberg/iceberg_partition_upper_case.out | 36 +++++++++++++++++++---
.../iceberg/iceberg_partition_upper_case.groovy | 26 +++++++++++++---
4 files changed, 67 insertions(+), 17 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 3cb31bb262..fdc9e3404d 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -631,11 +631,7 @@ Status OrcReader::set_fill_columns(
std::unordered_map<std::string, std::pair<uint32_t, int>> predicate_columns;
std::function<void(VExpr * expr)> visit_slot = [&](VExpr* expr) {
if (VSlotRef* slot_ref = typeid_cast<VSlotRef*>(expr)) {
- auto expr_name = slot_ref->expr_name();
- auto iter = _col_name_to_file_col_name.find(expr_name);
- if (iter != _col_name_to_file_col_name.end()) {
- expr_name = iter->second;
- }
+ auto& expr_name = slot_ref->expr_name();
predicate_columns.emplace(expr_name,
std::make_pair(slot_ref->column_id(), slot_ref->slot_id()));
if (slot_ref->column_id() == 0) {
@@ -678,6 +674,8 @@ Status OrcReader::set_fill_columns(
} else {
_lazy_read_ctx.predicate_columns.first.emplace_back(iter->first);
_lazy_read_ctx.predicate_columns.second.emplace_back(iter->second.second);
+ _lazy_read_ctx.predicate_orc_columns.emplace_back(
+ _col_name_to_file_col_name[iter->first]);
_lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first);
}
}
@@ -708,6 +706,10 @@ Status OrcReader::set_fill_columns(
_lazy_read_ctx.can_lazy_read = true;
}
+ if (_colname_to_value_range == nullptr || !_init_search_argument(_colname_to_value_range)) {
+ _lazy_read_ctx.can_lazy_read = false;
+ }
+
if (!_lazy_read_ctx.can_lazy_read) {
for (auto& kv : _lazy_read_ctx.predicate_partition_columns) {
_lazy_read_ctx.partition_columns.emplace(kv.first, kv.second);
@@ -722,12 +724,9 @@ Status OrcReader::set_fill_columns(
// create orc row reader
_row_reader_options.range(_range_start_offset, _range_size);
_row_reader_options.setTimezoneName(_ctz);
- if (!_init_search_argument(_colname_to_value_range)) {
- _lazy_read_ctx.can_lazy_read = false;
- }
_row_reader_options.include(_read_cols);
if (_lazy_read_ctx.can_lazy_read) {
- _row_reader_options.filter(_lazy_read_ctx.predicate_columns.first);
+ _row_reader_options.filter(_lazy_read_ctx.predicate_orc_columns);
_orc_filter = std::unique_ptr<ORCFilterImpl>(new ORCFilterImpl(this));
}
try {
@@ -1203,6 +1202,8 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
}
*read_rows = rr;
+ RETURN_IF_ERROR(_fill_partition_columns(block, rr, _lazy_read_ctx.partition_columns));
+ RETURN_IF_ERROR(_fill_missing_columns(block, rr, _lazy_read_ctx.missing_columns));
RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal(block, columns_to_filter, *_filter));
Block::erase_useless_column(block, column_to_keep);
} else {
@@ -1265,6 +1266,7 @@ void OrcReader::_fill_batch_vec(std::vector<orc::ColumnVectorBatch*>& result,
Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t size, void* arg) {
Block* block = (Block*)arg;
+ size_t origin_column_num = block->columns();
const auto& batch_vec = down_cast<orc::StructVectorBatch*>(&data)->fields;
for (auto& col_name : _lazy_read_ctx.predicate_columns.first) {
@@ -1312,6 +1314,7 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s
for (auto& col : _lazy_read_ctx.predicate_missing_columns) {
block->get_by_name(col.first).column->assume_mutable()->clear();
}
+ Block::erase_useless_column(block, origin_column_num);
}
uint16_t new_size = 0;
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h
index e4539b2180..1230b782f8 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -102,6 +102,7 @@ struct LazyReadContext {
// be different with orc column name
// std::pair<std::list<col_name>, std::vector<slot_id>>
std::pair<std::list<std::string>, std::vector<int>> predicate_columns;
+ // predicate orc file column names
std::list<std::string> predicate_orc_columns;
std::vector<std::string> lazy_read_columns;
std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>
diff --git a/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out b/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out
index 9da2a5b80f..376a9495b0 100644
--- a/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out
+++ b/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out
@@ -26,31 +26,45 @@ Shanghai
-- !orcupper5 --
2 k2_2 k3_2 Beijing
+-- !orcupper6 --
+1 k2_1 k3_1 Beijing
+
+-- !orcupper7 --
+1 k2_1 k3_1 Beijing
+2 k2_2 k3_2 Beijing
+
-- !orclower1 --
1 k2_1 k3_1 Beijing
2 k2_2 k3_2 Beijing
3 k2_3 k3_3 Shanghai
4 k2_4 k3_4 Shanghai
--- !orclower1 --
+-- !orclower2 --
1 Beijing
2 Beijing
3 Shanghai
4 Shanghai
--- !orclower1 --
+-- !orclower3 --
1 k2_1
2 k2_2
3 k2_3
4 k2_4
--- !orclower1 --
+-- !orclower4 --
Beijing
Beijing
Shanghai
Shanghai
--- !orclower1 --
+-- !orclower5 --
+2 k2_2 k3_2 Beijing
+
+-- !orclower6 --
+1 k2_1 k3_1 Beijing
+
+-- !orclower7 --
+1 k2_1 k3_1 Beijing
2 k2_2 k3_2 Beijing
-- !parquetupper1 --
@@ -84,6 +98,13 @@ Shanghai
3 k2_3 k3_3 Shanghai
4 k2_4 k3_4 Shanghai
+-- !parquetupper7 --
+1 k2_1 k3_1 Beijing
+
+-- !parquetupper8 --
+1 k2_1 k3_1 Beijing
+2 k2_2 k3_2 Beijing
+
-- !parquetlower1 --
1 k2_1 k3_1 Beijing
2 k2_2 k3_2 Beijing
@@ -115,3 +136,10 @@ Shanghai
3 k2_3 k3_3 Shanghai
4 k2_4 k3_4 Shanghai
+-- !parquetupper7 --
+1 k2_1 k3_1 Beijing
+
+-- !parquetupper8 --
+1 k2_1 k3_1 Beijing
+2 k2_2 k3_2 Beijing
+
diff --git a/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy b/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy
index efc2aaf77f..b4957495dc 100644
--- a/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy
+++ b/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy
@@ -21,12 +21,16 @@ suite("iceberg_partition_upper_case", "p2") {
def orc_upper3 = """select k1, k2 from iceberg_partition_upper_case_orc order by k1;"""
def orc_upper4 = """select city from iceberg_partition_upper_case_orc order by city;"""
def orc_upper5 = """select * from iceberg_partition_upper_case_orc where k1>1 and city='Beijing' order by k1;"""
+ def orc_upper6 = """select * from iceberg_partition_upper_case_orc where k1=1 order by k1;"""
+ def orc_upper7 = """select * from iceberg_partition_upper_case_orc where k2 like '%k2%' and city like '%Bei%' order by k1;"""
def orc_lower1 = """select * from iceberg_partition_lower_case_orc order by k1;"""
def orc_lower2 = """select k1, city from iceberg_partition_lower_case_orc order by k1;"""
def orc_lower3 = """select k1, k2 from iceberg_partition_lower_case_orc order by k1;"""
def orc_lower4 = """select city from iceberg_partition_lower_case_orc order by city;"""
def orc_lower5 = """select * from iceberg_partition_lower_case_orc where k1>1 and city='Beijing' order by k1;"""
+ def orc_lower6 = """select * from iceberg_partition_lower_case_orc where k1=1 order by k1;"""
+ def orc_lower7 = """select * from iceberg_partition_lower_case_orc where k2 like '%k2%' and city like '%Bei%' order by k1;"""
def parquet_upper1 = """select * from iceberg_partition_upper_case_parquet order by k1;"""
def parquet_upper2 = """select k1, city from iceberg_partition_upper_case_parquet order by k1;"""
@@ -34,6 +38,8 @@ suite("iceberg_partition_upper_case", "p2") {
def parquet_upper4 = """select city from iceberg_partition_upper_case_parquet order by city;"""
def parquet_upper5 = """select * from iceberg_partition_upper_case_parquet where k1>1 and city='Beijing' order by k1;"""
def parquet_upper6 = """select * from iceberg_partition_upper_case_parquet where substring(city, 6)='hai' order by k1;"""
+ def parquet_upper7 = """select * from iceberg_partition_upper_case_parquet where k1=1 order by k1;"""
+ def parquet_upper8 = """select * from iceberg_partition_upper_case_parquet where k2 like '%k2%' and city like '%Bei%' order by k1;"""
def parquet_lower1 = """select * from iceberg_partition_lower_case_parquet order by k1;"""
def parquet_lower2 = """select k1, city from iceberg_partition_lower_case_parquet order by k1;"""
@@ -41,6 +47,8 @@ suite("iceberg_partition_upper_case", "p2") {
def parquet_lower4 = """select city from iceberg_partition_lower_case_parquet order by city;"""
def parquet_lower5 = """select * from iceberg_partition_lower_case_parquet where k1>1 and city='Beijing' order by k1;"""
def parquet_lower6 = """select * from iceberg_partition_lower_case_parquet where substring(city, 6)='hai' order by k1;"""
+ def parquet_lower7 = """select * from iceberg_partition_lower_case_parquet where k1=1 order by k1;"""
+ def parquet_lower8 = """select * from iceberg_partition_lower_case_parquet where k2 like '%k2%' and city like '%Bei%' order by k1;"""
String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
@@ -63,23 +71,33 @@ suite("iceberg_partition_upper_case", "p2") {
qt_orcupper3 orc_upper3
qt_orcupper4 orc_upper4
qt_orcupper5 orc_upper5
+ qt_orcupper6 orc_upper6
+ qt_orcupper7 orc_upper7
+
qt_orclower1 orc_lower1
- qt_orclower1 orc_lower2
- qt_orclower1 orc_lower3
- qt_orclower1 orc_lower4
- qt_orclower1 orc_lower5
+ qt_orclower2 orc_lower2
+ qt_orclower3 orc_lower3
+ qt_orclower4 orc_lower4
+ qt_orclower5 orc_lower5
+ qt_orclower6 orc_lower6
+ qt_orclower7 orc_lower7
qt_parquetupper1 parquet_upper1
qt_parquetupper2 parquet_upper2
qt_parquetupper3 parquet_upper3
qt_parquetupper4 parquet_upper4
qt_parquetupper5 parquet_upper5
qt_parquetupper6 parquet_upper6
+ qt_parquetupper7 parquet_upper7
+ qt_parquetupper8 parquet_upper8
qt_parquetlower1 parquet_lower1
qt_parquetlower2 parquet_lower2
qt_parquetlower3 parquet_lower3
qt_parquetlower4 parquet_lower4
qt_parquetlower5 parquet_lower5
qt_parquetlower6 parquet_lower6
+ qt_parquetupper7 parquet_upper7
+ qt_parquetupper8 parquet_upper8
}
}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org