You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2019/12/09 14:24:33 UTC

[impala] branch master updated (17e534e -> 4093975)

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 17e534e  IMPALA-9126: part 4: hash join builder manages spilling
     new 85c9895  Update gitignore files
     new f33a9d0  IMPALA-8184: Add timestamp validation to ORC scanner
     new 4093975  Update mvn snapshot dependencies in "clean" checkouts

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .gitignore                                              |  12 ++++++++++--
 be/src/exec/orc-column-readers.cc                       |   6 ++++++
 buildall.sh                                             |   9 ++++-----
 common/thrift/generate_error_codes.py                   |   4 ++++
 fe/.gitignore                                           |   7 ++++++-
 testdata/data/README                                    |   6 +++++-
 testdata/data/out_of_range_timestamp.orc                | Bin 0 -> 229 bytes
 .../DataErrorsTest/orc-out-of-range-timestamp.test      |  16 ++++++++++++++++
 tests/query_test/test_scanners.py                       |  11 +++++++++++
 9 files changed, 62 insertions(+), 9 deletions(-)
 create mode 100644 testdata/data/out_of_range_timestamp.orc
 create mode 100644 testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test


[impala] 01/03: Update gitignore files

Posted by cs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 85c9895c119a0dcc5872dde29d8d8b1c243f7903
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Dec 5 13:38:21 2019 -0800

    Update gitignore files
    
    This adds in a handful of files that I had on my local machine.
    
    Change-Id: I357441fab00ac031fbc70c40e4574e7a723fdedd
    Reviewed-on: http://gerrit.cloudera.org:8080/14858
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .gitignore    | 12 ++++++++++--
 fe/.gitignore |  7 ++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index ca47086..b208652 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,10 +16,10 @@ bin/impala-config-local.sh
 .cache
 .cdh
 .cdp
+.pep8
 
 # distcc options
-.impala_compiler_opts
-.impala_compiler_opts_v2
+.impala_compiler_opts*
 
 pprof.out
 
@@ -37,6 +37,8 @@ CTestTestfile.cmake
 .ninja_log
 build.ninja
 rules.ninja
+CMakeDoxyfile.in
+CMakeDoxygenDefaults.cmake
 
 # Build timestamp files
 .*timestamp
@@ -99,3 +101,9 @@ eclipse-classes/
 # Debugging
 ad-hoc.test
 nohup.out
+
+# Python virtualenvs
+gerrit_critic_venv
+
+# Output of single_node_perf_run.py
+perf_results/
diff --git a/fe/.gitignore b/fe/.gitignore
index fbade49..be8164b 100644
--- a/fe/.gitignore
+++ b/fe/.gitignore
@@ -19,8 +19,10 @@ target
 # emacs backup files
 *~
 
-# Generated hive-site.xml file
+# Generated hive-site.xml files
 src/test/resources/hive-site.xml
+src/test/resources/hive-site-ext/
+src/test/resources/hive-site_ext.xml
 
 # Generated hbase-site.xml file
 src/test/resources/hbase-site.xml
@@ -34,6 +36,9 @@ src/test/resources/minicluster-conf.xml
 # Generated hive-log4j2.properties file
 src/test/resources/hive-log4j2.properties
 
+# Generated authorization policy
+src/test/resources/authz-policy.ini
+
 # Generated thrift files
 generated-sources
 


[impala] 03/03: Update mvn snapshot dependencies in "clean" checkouts

Posted by cs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 4093975e2d5da61c1ef6217796a1543a602b1d40
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Thu Dec 5 20:18:56 2019 +0100

    Update mvn snapshot dependencies in "clean" checkouts
    
    buildall.sh saves the cdh/cdp version into .cdh/.cdp, and updates
    the dependencies if this doesn't match the version from config.
    
    This led to updating the dependencies when switching to a different
    checkout in the same directory, but didn't do this in a fresh checkout,
    which could lead to build issues when the .m2 cache was dirty.
    
    Note that this doesn't protect against switching between Impala
    directories with different cdh/cdp versions.
    
    Change-Id: I8bbde17e7c97466391aa20ac3d59c6943e7f7256
    Reviewed-on: http://gerrit.cloudera.org:8080/14854
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Laszlo Gaal <la...@cloudera.com>
---
 buildall.sh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/buildall.sh b/buildall.sh
index 23ea8f9..ab7d415 100755
--- a/buildall.sh
+++ b/buildall.sh
@@ -547,11 +547,10 @@ bootstrap_dependencies
 # local cache.
 CDH_FILE="${IMPALA_HOME}/.cdh"
 CDP_FILE="${IMPALA_HOME}/.cdp"
-if [[ -f ${CDH_FILE} && -f ${CDP_FILE} ]]; then
-  if [[ $(cat ${CDH_FILE}) != ${CDH_BUILD_NUMBER} || \
-        $(cat ${CDP_FILE}) != ${CDP_BUILD_NUMBER} ]]; then
-    export IMPALA_MAVEN_OPTIONS="${IMPALA_MAVEN_OPTIONS} -U"
-  fi
+if [[ ! -f ${CDH_FILE} || ! -f ${CDP_FILE} || \
+      $(cat ${CDH_FILE}) != ${CDH_BUILD_NUMBER} || \
+      $(cat ${CDP_FILE}) != ${CDP_BUILD_NUMBER} ]]; then
+  export IMPALA_MAVEN_OPTIONS="${IMPALA_MAVEN_OPTIONS} -U"
 fi
 echo "${CDH_BUILD_NUMBER}" > ${CDH_FILE}
 echo "${CDP_BUILD_NUMBER}" > ${CDP_FILE}


[impala] 02/03: IMPALA-8184: Add timestamp validation to ORC scanner

Posted by cs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f33a9d0d426f2cbaaf225d7ea08b15966e537f31
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed Dec 4 20:15:31 2019 +0100

    IMPALA-8184: Add timestamp validation to ORC scanner
    
    Hive can write timestamps that are outside Impala's valid
    range (Impala: 1400-9999 Hive: 0001-9999). This change adds
    validation logic to ORC reading that replaces out-of-range
    timestamps with NULLs and adds a warning to the query.
    
    The logic is very similar to the existing validation in
    Parquet. Some differences:
    - "time of day" is not checked separately as it doesn't make
      sense with ORC's encoding
    - instead of the column name, only the column id is added to the warning
    
    Testing:
    - added a simple EE test that scans an existing ORC file
    
    Change-Id: I8ee2ba83a54f93d37e8832e064f2c8418b503490
    Reviewed-on: http://gerrit.cloudera.org:8080/14832
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/orc-column-readers.cc                       |   6 ++++++
 common/thrift/generate_error_codes.py                   |   4 ++++
 testdata/data/README                                    |   6 +++++-
 testdata/data/out_of_range_timestamp.orc                | Bin 0 -> 229 bytes
 .../DataErrorsTest/orc-out-of-range-timestamp.test      |  16 ++++++++++++++++
 tests/query_test/test_scanners.py                       |  11 +++++++++++
 6 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 05e8ea2..69128b9 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -191,6 +191,12 @@ Status OrcTimestampReader::ReadValue(int row_idx, Tuple* tuple, MemPool* pool) {
   auto slot = reinterpret_cast<TimestampValue*>(GetSlot(tuple));
   *slot = TimestampValue::FromUnixTimeNanos(secs, nanos,
       scanner_->state_->local_time_zone());
+  if (UNLIKELY(!slot->HasDate())) {
+    SetNullSlot(tuple);
+    TErrorCode::type errorCode = TErrorCode::ORC_TIMESTAMP_OUT_OF_RANGE;
+    ErrorMsg msg(errorCode, scanner_->filename(), orc_column_id_);
+    return scanner_->state_->LogOrReturnError(msg);
+  }
   return Status::OK();
 }
 
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 8f0cbaa..3da3061 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -438,6 +438,10 @@ error_codes = (
 
   ("AVRO_INVALID_DATE", 144, "Avro file '$0' is corrupt: out of range date value $1 "
    "at offset $2. The valid date range is -719162..2932896 (0001-01-01..9999-12-31)."),
+
+  ("ORC_TIMESTAMP_OUT_OF_RANGE", 145,
+   "ORC file '$0' column '$1' contains an out of range timestamp. "
+   "The valid date range is 1400-01-01..9999-12-31."),
 )
 
 import sys
diff --git a/testdata/data/README b/testdata/data/README
index bbffeb4..8e7e7a3 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -450,4 +450,8 @@ relationships along with parent_table and child_table.
 
 child_table:
 Created manually. Contains four columns. 'seq' column is the primary key of this table. ('id', 'year') form a foreign key referring to parent_table('id', 'year') and 'a' is a
-foreign key referring to parent_table_2's primary column 'a'.
\ No newline at end of file
+foreign key referring to parent_table_2's primary column 'a'.
+
+out_of_range_timestamp.orc:
+Created with Hive. ORC file with a single timestamp column 'ts'.
+Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time range.
diff --git a/testdata/data/out_of_range_timestamp.orc b/testdata/data/out_of_range_timestamp.orc
new file mode 100644
index 0000000..268b900
Binary files /dev/null and b/testdata/data/out_of_range_timestamp.orc differ
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
new file mode 100644
index 0000000..c39cd21
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
@@ -0,0 +1,16 @@
+====
+---- QUERY
+SET abort_on_error=1;
+SELECT * FROM out_of_range_timestamp;
+---- CATCH
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
+====
+---- QUERY
+SET abort_on_error=0;
+SELECT * FROM out_of_range_timestamp;
+---- TYPES
+TIMESTAMP
+---- RESULTS
+NULL
+---- ERRORS
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 5b17ecb..ea71911 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1323,6 +1323,17 @@ class TestOrc(ImpalaTestSuite):
 
     self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
 
+  def test_orc_timestamp_out_of_range(self, vector, unique_database):
+      """Test the validation of out-of-range timestamps."""
+      test_files = ["testdata/data/out_of_range_timestamp.orc"]
+      create_table_and_copy_files(self.client, "create table {db}.{tbl} "
+                                               "(ts timestamp) stored as orc",
+                                  unique_database, "out_of_range_timestamp", test_files)
+      new_vector = deepcopy(vector)
+      del new_vector.get_value('exec_option')['abort_on_error']
+      self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
+                         new_vector, unique_database)
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):