You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2020/01/10 23:13:41 UTC
[impala] 01/03: IMPALA-9277: Catch exception thrown from orc::ColumnSelector::updateSelectedByTypeId

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 8f448cfc6bb3c101ab4737eecd1e84a858744793
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Thu Jan 9 18:16:57 2020 +0100

    IMPALA-9277: Catch exception thrown from orc::ColumnSelector::updateSelectedByTypeId
    
    orc::ColumnSelector::updateSelectedByTypeId can throw an exception on
    malformed ORC files. The exception wasn't caught by Impala, so it
    caused program termination.
    
    The fix is to simply catch the exception and return a parse error
    instead.
    
    Testing:
    * added corrupt ORC file and e2e test
    
    Change-Id: I2f706bc832298cb5089e539b7a818cb86d02199f
    Reviewed-on: http://gerrit.cloudera.org:8080/14994
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/hdfs-orc-scanner.cc   |  19 +++++++++++++------
 testdata/data/README              |   3 +++
 testdata/data/corrupt_schema.orc  | Bin 0 -> 1958 bytes
 tests/query_test/test_scanners.py |  10 ++++++++++
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index 0c5739b..164feaa 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -194,12 +194,19 @@ Status HdfsOrcScanner::Open(ScannerContext* context) {
   // ancestors and children will be selected too.
   // Here we haven't read stripe data yet so no orc::RowReaders are created. To get the
   // selected types we create a temp orc::RowReader (but won't read rows from it).
-  unique_ptr<orc::RowReader> tmp_row_reader =
-      reader_->createRowReader(row_reader_options_);
-  const orc::Type* root_type = &tmp_row_reader->getSelectedType();
-  DCHECK_EQ(root_type->getKind(), orc::TypeKind::STRUCT);
-  orc_root_reader_ = this->obj_pool_.Add(
-      new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+  try {
+    unique_ptr<orc::RowReader> tmp_row_reader =
+        reader_->createRowReader(row_reader_options_);
+    const orc::Type* root_type = &tmp_row_reader->getSelectedType();
+    DCHECK_EQ(root_type->getKind(), orc::TypeKind::STRUCT);
+    orc_root_reader_ = this->obj_pool_.Add(
+        new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+  } catch (std::exception& e) {
+    string msg = Substitute("Encountered parse error during schema selection in "
+        "ORC file $0: $1", filename(), e.what());
+    parse_status_ = Status(msg);
+    return parse_status_;
+  }
 
   // Set top-level template tuple.
   template_tuple_ = template_tuple_map_[scan_node_->tuple_desc()];
diff --git a/testdata/data/README b/testdata/data/README
index 8e7e7a3..a8f20c2 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -455,3 +455,6 @@ foreign key referring to parent_table_2's primary column 'a'.
 out_of_range_timestamp.orc:
 Created with Hive. ORC file with a single timestamp column 'ts'.
 Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time range.
+
+corrupt_schema.orc:
+ORC file from IMPALA-9277, generated by fuzz test. The file contains malformed metadata.
diff --git a/testdata/data/corrupt_schema.orc b/testdata/data/corrupt_schema.orc
new file mode 100644
index 0000000..86d2afe
Binary files /dev/null and b/testdata/data/corrupt_schema.orc differ
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index ea71911..1d26463 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1334,6 +1334,16 @@ class TestOrc(ImpalaTestSuite):
       self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
                          new_vector, unique_database)
 
+  def test_invalid_schema(self, vector, unique_database):
+    """Test scanning of ORC file with malformed schema."""
+    test_files = ["testdata/data/corrupt_schema.orc"]
+    create_table_and_copy_files(self.client,
+        "CREATE TABLE {db}.{tbl} (id BIGINT) STORED AS ORC",
+        unique_database, "corrupt_schema", test_files)
+    err = self.execute_query_expect_failure(self.client,
+        "select count(*) from {0}.{1}".format(unique_database, "corrupt_schema"))
+    assert "Encountered parse error during schema selection" in str(err)
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):