You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/25 12:23:52 UTC

[impala] 05/05: IMPALA-11344: Missing slots in all cases should be allowed to be read

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 44dc157a2c10578b82518012aa2e9aa9288dc6e5
Author: ttttttz <24...@qq.com>
AuthorDate: Wed Jun 22 11:53:28 2022 +0800

    IMPALA-11344: Missing slots in all cases should be allowed to be read
    
    When selecting only the missing fields of ORC files and the missing fields
    contain non-partition fields, the query will fail due to `Parse error in
    possibly corrupt ORC file: '$filename'. No columns found for this scan`.
    We should allow reading missing slots in all cases.
    
    Testing:
    - Added a test to test_scanners.py that ensures the query can be
      executed successfully when selecting only the missing fields of
      ORC files.
    Change-Id: I15dca47ba5f7a93bfd5fcba3cab4ac6d64459023
    Reviewed-on: http://gerrit.cloudera.org:8080/18652
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/18907
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
---
 be/src/exec/orc-column-readers.cc | 23 +--------------
 tests/query_test/test_scanners.py | 60 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 644ac325f..7c9ae072d 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -530,28 +530,7 @@ Status OrcStructReader::TopLevelReadValueBatch(ScratchTupleBatch* scratch_batch,
   }
   int num_rows_read = scratch_batch->num_tuples - scratch_batch_idx;
   if (children_.empty()) {
-    // We allow empty 'children_' for original files, because we might select the
-    // synthetic 'rowid' field which is not present in original files.
-    // We also allow empty 'children_' when we need to validate row batches of a zero slot
-    // scan. In that case 'children_' is empty and only 'row_validator_' owns an ORC
-    // vector batch (the write id batch).
-    bool valid_empty_children = scanner_->acid_original_file_ ||
-         (scanner_->row_batches_need_validation_ &&
-          scanner_->scan_node_->IsZeroSlotTableScan());
-    if (!valid_empty_children) {
-      bool only_partitions = true;
-      for (SlotDescriptor* slot : tuple_desc_->slots()) {
-        if (!scanner_->IsPartitionKeySlot(slot)) {
-          only_partitions = false;
-          break;
-        }
-      }
-      if (!only_partitions) {
-        return Status(Substitute("Parse error in possibly corrupt ORC file: '$0'. "
-            "No columns found for this scan.",
-            scanner_->filename()));
-      }
-    }
+    // We allow empty 'children_' in all cases.
     DCHECK_EQ(0, num_rows_read);
     num_rows_read = std::min(scratch_batch->capacity - scratch_batch->num_tuples,
                              NumElements() - row_idx_);
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 112b53dc0..ca15d43a2 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1734,6 +1734,66 @@ class TestOrc(ImpalaTestSuite):
 
     self.run_test_case('QueryTest/hive2-pre-gregorian-date-orc', vector, unique_database)
 
+  @SkipIfABFS.hive
+  @SkipIfADLS.hive
+  @SkipIfIsilon.hive
+  @SkipIfLocal.hive
+  @SkipIfS3.hive
+  @SkipIfGCS.hive
+  @SkipIfCOS.hive
+  def test_missing_field_orc(self, unique_database):
+    # Test scanning orc files with missing fields in file meta.
+    orc_tbl_name = unique_database + ".missing_field_orc"
+    self.client.execute("create table %s (f0 int) stored as orc" % orc_tbl_name)
+    self.run_stmt_in_hive("insert into table %s select 1" % orc_tbl_name)
+    self.client.execute("refresh %s" % orc_tbl_name)
+
+    self.client.execute("alter table %s add columns(f1 int)" % orc_tbl_name)
+    result = self.client.execute("select f1 from %s " % orc_tbl_name)
+    assert result.data == ['NULL']
+
+    self.client.execute("alter table %s add columns(f2 STRUCT<s0:STRING, s1:STRING>)"
+                        % orc_tbl_name)
+    result = self.client.execute("select f2.s0 from %s " % orc_tbl_name)
+    assert result.data == ['NULL']
+
+    orc_tbl_name = unique_database + ".missing_field_full_txn_test"
+    self.client.execute("create table %s(f0 int) stored as orc "
+                        "tblproperties('transactional'='true')" % orc_tbl_name)
+    self.run_stmt_in_hive("insert into %s values(0)" % orc_tbl_name)
+    self.run_stmt_in_hive("alter table %s add columns(f1 int)" % orc_tbl_name)
+    self.run_stmt_in_hive("insert into %s values(1,1)" % orc_tbl_name)
+    self.client.execute("refresh %s" % orc_tbl_name)
+    result = self.client.execute("select f1 from %s" % orc_tbl_name)
+    assert len(result.data) == 2
+    assert '1' in result.data
+    assert 'NULL' in result.data
+
+    # TODO: add a test case for Iceberg tables once IMPALA-10542 is done.
+    # orc_tbl_name = unique_database + ".missing_field_iceberg_test"
+    # self.client.execute("create table %s (f0 int) stored as iceberg "
+    #                     "tblproperties('write.format.default' = 'orc')"
+    #                     % orc_tbl_name)
+    # self.run_stmt_in_hive("insert into %s values(0)" % orc_tbl_name)
+    # self.run_stmt_in_hive("alter table %s add columns(f1 int)" % orc_tbl_name)
+    # self.run_stmt_in_hive("insert into %s values(1,1)" % orc_tbl_name)
+    # self.client.execute("refresh %s" % orc_tbl_name)
+    # result = self.client.execute("select f1 from %s" % orc_tbl_name)
+    # assert len(result.data) == 2
+    # assert '1' in result.data
+    # assert 'NULL' in result.data
+
+    orc_tbl_name = unique_database + ".lineitem_orc_ext"
+    test_file = "/test-warehouse/tpch.lineitem_orc_def"
+    create_sql = "create external table %s like tpch_orc_def.lineitem " \
+                 "location '%s'" % (orc_tbl_name, test_file)
+    self.client.execute(create_sql)
+    self.client.execute("alter table %s add columns (new_col int)" % orc_tbl_name)
+    result = self.execute_query("select count(*) from %s where new_col is null"
+                                % orc_tbl_name)
+    assert len(result.data) == 1
+    assert '6001215' in result.data
+
 
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod