You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2019/12/09 14:24:35 UTC
[impala] 02/03: IMPALA-8184: Add timestamp validation to ORC scanner
This is an automated email from the ASF dual-hosted git repository.
csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit f33a9d0d426f2cbaaf225d7ea08b15966e537f31
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed Dec 4 20:15:31 2019 +0100
IMPALA-8184: Add timestamp validation to ORC scanner
Hive can write timestamps that are outside Impala's valid
range (Impala: 1400-9999, Hive: 0001-9999). This change adds
validation logic to ORC reading that replaces out-of-range
timestamps with NULLs and adds a warning to the query.
The logic is very similar to the existing validation in
Parquet. Some differences:
- "time of day" is not checked separately as it doesn't make
sense with ORC's encoding
- only the column id, not the column name, is added to the warning
Testing:
- added a simple EE test that scans an existing ORC file
Change-Id: I8ee2ba83a54f93d37e8832e064f2c8418b503490
Reviewed-on: http://gerrit.cloudera.org:8080/14832
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/orc-column-readers.cc | 6 ++++++
common/thrift/generate_error_codes.py | 4 ++++
testdata/data/README | 6 +++++-
testdata/data/out_of_range_timestamp.orc | Bin 0 -> 229 bytes
.../DataErrorsTest/orc-out-of-range-timestamp.test | 16 ++++++++++++++++
tests/query_test/test_scanners.py | 11 +++++++++++
6 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 05e8ea2..69128b9 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -191,6 +191,12 @@ Status OrcTimestampReader::ReadValue(int row_idx, Tuple* tuple, MemPool* pool) {
auto slot = reinterpret_cast<TimestampValue*>(GetSlot(tuple));
*slot = TimestampValue::FromUnixTimeNanos(secs, nanos,
scanner_->state_->local_time_zone());
+ if (UNLIKELY(!slot->HasDate())) {
+ SetNullSlot(tuple);
+ TErrorCode::type errorCode = TErrorCode::ORC_TIMESTAMP_OUT_OF_RANGE;
+ ErrorMsg msg(errorCode, scanner_->filename(), orc_column_id_);
+ return scanner_->state_->LogOrReturnError(msg);
+ }
return Status::OK();
}
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 8f0cbaa..3da3061 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -438,6 +438,10 @@ error_codes = (
("AVRO_INVALID_DATE", 144, "Avro file '$0' is corrupt: out of range date value $1 "
"at offset $2. The valid date range is -719162..2932896 (0001-01-01..9999-12-31)."),
+
+ ("ORC_TIMESTAMP_OUT_OF_RANGE", 145,
+ "ORC file '$0' column '$1' contains an out of range timestamp. "
+ "The valid date range is 1400-01-01..9999-12-31."),
)
import sys
diff --git a/testdata/data/README b/testdata/data/README
index bbffeb4..8e7e7a3 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -450,4 +450,8 @@ relationships along with parent_table and child_table.
child_table:
Created manually. Contains four columns. 'seq' column is the primary key of this table. ('id', 'year') form a foreign key referring to parent_table('id', 'year') and 'a' is a
-foreign key referring to parent_table_2's primary column 'a'.
\ No newline at end of file
+foreign key referring to parent_table_2's primary column 'a'.
+
+out_of_range_timestamp.orc:
+Created with Hive. ORC file with a single timestamp column 'ts'.
+Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time range.
diff --git a/testdata/data/out_of_range_timestamp.orc b/testdata/data/out_of_range_timestamp.orc
new file mode 100644
index 0000000..268b900
Binary files /dev/null and b/testdata/data/out_of_range_timestamp.orc differ
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
new file mode 100644
index 0000000..c39cd21
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
@@ -0,0 +1,16 @@
+====
+---- QUERY
+SET abort_on_error=1;
+SELECT * FROM out_of_range_timestamp;
+---- CATCH
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
+====
+---- QUERY
+SET abort_on_error=0;
+SELECT * FROM out_of_range_timestamp;
+---- TYPES
+TIMESTAMP
+---- RESULTS
+NULL
+---- ERRORS
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 5b17ecb..ea71911 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1323,6 +1323,17 @@ class TestOrc(ImpalaTestSuite):
self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
+ def test_orc_timestamp_out_of_range(self, vector, unique_database):
+ """Test the validation of out-of-range timestamps."""
+ test_files = ["testdata/data/out_of_range_timestamp.orc"]
+ create_table_and_copy_files(self.client, "create table {db}.{tbl} "
+ "(ts timestamp) stored as orc",
+ unique_database, "out_of_range_timestamp", test_files)
+ new_vector = deepcopy(vector)
+ del new_vector.get_value('exec_option')['abort_on_error']
+ self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
+ new_vector, unique_database)
+
class TestScannerReservation(ImpalaTestSuite):
@classmethod
def get_workload(self):