You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2019/12/09 14:24:35 UTC

[impala] 02/03: IMPALA-8184: Add timestamp validation to ORC scanner

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f33a9d0d426f2cbaaf225d7ea08b15966e537f31
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed Dec 4 20:15:31 2019 +0100

    IMPALA-8184: Add timestamp validation to ORC scanner
    
    Hive can write timestamps that are outside Impala's valid
    range (Impala: years 1400-9999; Hive: years 0001-9999). This
    change adds validation logic to the ORC scanner that replaces
    out-of-range timestamps with NULL and adds a warning to the
    query (or fails the query if abort_on_error is set).
    
    The logic is very similar to the existing validation in the
    Parquet scanner. Some differences:
    - "time of day" is not checked separately, as that doesn't
      make sense with ORC's encoding
    - only the column id, not the column name, is added to the
      warning
    
    Testing:
    - added a simple end-to-end (EE) test that scans a pre-generated
      ORC file (testdata/data/out_of_range_timestamp.orc)
    
    Change-Id: I8ee2ba83a54f93d37e8832e064f2c8418b503490
    Reviewed-on: http://gerrit.cloudera.org:8080/14832
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/orc-column-readers.cc                       |   6 ++++++
 common/thrift/generate_error_codes.py                   |   4 ++++
 testdata/data/README                                    |   6 +++++-
 testdata/data/out_of_range_timestamp.orc                | Bin 0 -> 229 bytes
 .../DataErrorsTest/orc-out-of-range-timestamp.test      |  16 ++++++++++++++++
 tests/query_test/test_scanners.py                       |  11 +++++++++++
 6 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 05e8ea2..69128b9 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -191,6 +191,12 @@ Status OrcTimestampReader::ReadValue(int row_idx, Tuple* tuple, MemPool* pool) {
   auto slot = reinterpret_cast<TimestampValue*>(GetSlot(tuple));
   *slot = TimestampValue::FromUnixTimeNanos(secs, nanos,
       scanner_->state_->local_time_zone());
+  if (UNLIKELY(!slot->HasDate())) {
+    SetNullSlot(tuple);
+    TErrorCode::type errorCode = TErrorCode::ORC_TIMESTAMP_OUT_OF_RANGE;
+    ErrorMsg msg(errorCode, scanner_->filename(), orc_column_id_);
+    return scanner_->state_->LogOrReturnError(msg);
+  }
   return Status::OK();
 }
 
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 8f0cbaa..3da3061 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -438,6 +438,10 @@ error_codes = (
 
   ("AVRO_INVALID_DATE", 144, "Avro file '$0' is corrupt: out of range date value $1 "
    "at offset $2. The valid date range is -719162..2932896 (0001-01-01..9999-12-31)."),
+
+  ("ORC_TIMESTAMP_OUT_OF_RANGE", 145,
+   "ORC file '$0' column '$1' contains an out of range timestamp. "
+   "The valid date range is 1400-01-01..9999-12-31."),
 )
 
 import sys
diff --git a/testdata/data/README b/testdata/data/README
index bbffeb4..8e7e7a3 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -450,4 +450,8 @@ relationships along with parent_table and child_table.
 
 child_table:
 Created manually. Contains four columns. 'seq' column is the primary key of this table. ('id', 'year') form a foreign key referring to parent_table('id', 'year') and 'a' is a
-foreign key referring to parent_table_2's primary column 'a'.
\ No newline at end of file
+foreign key referring to parent_table_2's primary column 'a'.
+
+out_of_range_timestamp.orc:
+Created with Hive. ORC file with a single timestamp column 'ts'.
+Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time range.
diff --git a/testdata/data/out_of_range_timestamp.orc b/testdata/data/out_of_range_timestamp.orc
new file mode 100644
index 0000000..268b900
Binary files /dev/null and b/testdata/data/out_of_range_timestamp.orc differ
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
new file mode 100644
index 0000000..c39cd21
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
@@ -0,0 +1,16 @@
+====
+---- QUERY
+SET abort_on_error=1;
+SELECT * FROM out_of_range_timestamp;
+---- CATCH
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
+====
+---- QUERY
+SET abort_on_error=0;
+SELECT * FROM out_of_range_timestamp;
+---- TYPES
+TIMESTAMP
+---- RESULTS
+NULL
+---- ERRORS
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 5b17ecb..ea71911 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1323,6 +1323,17 @@ class TestOrc(ImpalaTestSuite):
 
     self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
 
+  def test_orc_timestamp_out_of_range(self, vector, unique_database):
+      """Test the validation of out-of-range timestamps."""
+      test_files = ["testdata/data/out_of_range_timestamp.orc"]
+      create_table_and_copy_files(self.client, "create table {db}.{tbl} "
+                                               "(ts timestamp) stored as orc",
+                                  unique_database, "out_of_range_timestamp", test_files)
+      new_vector = deepcopy(vector)
+      del new_vector.get_value('exec_option')['abort_on_error']
+      self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
+                         new_vector, unique_database)
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):