You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/25 12:23:52 UTC
[impala] 05/05: IMPALA-11344: Missing slots in all cases should be allowed to be read
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 44dc157a2c10578b82518012aa2e9aa9288dc6e5
Author: ttttttz <24...@qq.com>
AuthorDate: Wed Jun 22 11:53:28 2022 +0800
IMPALA-11344: Missing slots in all cases should be allowed to be read
When a query selects only fields that are missing from the ORC files, and
those missing fields include non-partition fields, the query fails with `Parse error in
possibly corrupt ORC file: '$filename'. No columns found for this scan`.
We should allow missing slots to be read in all cases.
Testing:
- Added a test to test_scanners.py that ensures the query can be
executed successfully when selecting only the missing fields of
ORC files.
Change-Id: I15dca47ba5f7a93bfd5fcba3cab4ac6d64459023
Reviewed-on: http://gerrit.cloudera.org:8080/18652
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-on: http://gerrit.cloudera.org:8080/18907
Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
---
be/src/exec/orc-column-readers.cc | 23 +--------------
tests/query_test/test_scanners.py | 60 +++++++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 644ac325f..7c9ae072d 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -530,28 +530,7 @@ Status OrcStructReader::TopLevelReadValueBatch(ScratchTupleBatch* scratch_batch,
}
int num_rows_read = scratch_batch->num_tuples - scratch_batch_idx;
if (children_.empty()) {
- // We allow empty 'children_' for original files, because we might select the
- // synthetic 'rowid' field which is not present in original files.
- // We also allow empty 'children_' when we need to validate row batches of a zero slot
- // scan. In that case 'children_' is empty and only 'row_validator_' owns an ORC
- // vector batch (the write id batch).
- bool valid_empty_children = scanner_->acid_original_file_ ||
- (scanner_->row_batches_need_validation_ &&
- scanner_->scan_node_->IsZeroSlotTableScan());
- if (!valid_empty_children) {
- bool only_partitions = true;
- for (SlotDescriptor* slot : tuple_desc_->slots()) {
- if (!scanner_->IsPartitionKeySlot(slot)) {
- only_partitions = false;
- break;
- }
- }
- if (!only_partitions) {
- return Status(Substitute("Parse error in possibly corrupt ORC file: '$0'. "
- "No columns found for this scan.",
- scanner_->filename()));
- }
- }
+ // We allow empty 'children_' in all cases.
DCHECK_EQ(0, num_rows_read);
num_rows_read = std::min(scratch_batch->capacity - scratch_batch->num_tuples,
NumElements() - row_idx_);
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 112b53dc0..ca15d43a2 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1734,6 +1734,66 @@ class TestOrc(ImpalaTestSuite):
self.run_test_case('QueryTest/hive2-pre-gregorian-date-orc', vector, unique_database)
+ @SkipIfABFS.hive
+ @SkipIfADLS.hive
+ @SkipIfIsilon.hive
+ @SkipIfLocal.hive
+ @SkipIfS3.hive
+ @SkipIfGCS.hive
+ @SkipIfCOS.hive
+ def test_missing_field_orc(self, unique_database):
+ # Test scanning orc files with missing fields in file meta.
+ orc_tbl_name = unique_database + ".missing_field_orc"
+ self.client.execute("create table %s (f0 int) stored as orc" % orc_tbl_name)
+ self.run_stmt_in_hive("insert into table %s select 1" % orc_tbl_name)
+ self.client.execute("refresh %s" % orc_tbl_name)
+
+ self.client.execute("alter table %s add columns(f1 int)" % orc_tbl_name)
+ result = self.client.execute("select f1 from %s " % orc_tbl_name)
+ assert result.data == ['NULL']
+
+ self.client.execute("alter table %s add columns(f2 STRUCT<s0:STRING, s1:STRING>)"
+ % orc_tbl_name)
+ result = self.client.execute("select f2.s0 from %s " % orc_tbl_name)
+ assert result.data == ['NULL']
+
+ orc_tbl_name = unique_database + ".missing_field_full_txn_test"
+ self.client.execute("create table %s(f0 int) stored as orc "
+ "tblproperties('transactional'='true')" % orc_tbl_name)
+ self.run_stmt_in_hive("insert into %s values(0)" % orc_tbl_name)
+ self.run_stmt_in_hive("alter table %s add columns(f1 int)" % orc_tbl_name)
+ self.run_stmt_in_hive("insert into %s values(1,1)" % orc_tbl_name)
+ self.client.execute("refresh %s" % orc_tbl_name)
+ result = self.client.execute("select f1 from %s" % orc_tbl_name)
+ assert len(result.data) == 2
+ assert '1' in result.data
+ assert 'NULL' in result.data
+
+ # TODO: add a test case for Iceberg tables once IMPALA-10542 is done.
+ # orc_tbl_name = unique_database + ".missing_field_iceberg_test"
+ # self.client.execute("create table %s (f0 int) stored as iceberg "
+ # "tblproperties('write.format.default' = 'orc')"
+ # % orc_tbl_name)
+ # self.run_stmt_in_hive("insert into %s values(0)" % orc_tbl_name)
+ # self.run_stmt_in_hive("alter table %s add columns(f1 int)" % orc_tbl_name)
+ # self.run_stmt_in_hive("insert into %s values(1,1)" % orc_tbl_name)
+ # self.client.execute("refresh %s" % orc_tbl_name)
+ # result = self.client.execute("select f1 from %s" % orc_tbl_name)
+ # assert len(result.data) == 2
+ # assert '1' in result.data
+ # assert 'NULL' in result.data
+
+ orc_tbl_name = unique_database + ".lineitem_orc_ext"
+ test_file = "/test-warehouse/tpch.lineitem_orc_def"
+ create_sql = "create external table %s like tpch_orc_def.lineitem " \
+ "location '%s'" % (orc_tbl_name, test_file)
+ self.client.execute(create_sql)
+ self.client.execute("alter table %s add columns (new_col int)" % orc_tbl_name)
+ result = self.execute_query("select count(*) from %s where new_col is null"
+ % orc_tbl_name)
+ assert len(result.data) == 1
+ assert '6001215' in result.data
+
class TestScannerReservation(ImpalaTestSuite):
@classmethod