You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by jo...@apache.org on 2021/05/07 23:52:18 UTC

[impala] branch master updated (603091e -> e265434)

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 603091e  IMPALA-10692: Fix acid insert when event polling is enabled
     new 9253e0a  IMPALA-10683: Skip test 'test_double_precision' for non-HDFS test env
     new e265434  IMPALA-9967: Add support for reading ORC's TIMESTAMP WITH LOCAL TIMEZONE

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/exec/orc-column-readers.cc                     |   2 +-
 be/src/exec/orc-column-readers.h                      |   7 ++++++-
 be/src/exec/orc-metadata-utils.cc                     |   1 +
 bin/impala-config.sh                                  |   4 ++--
 .../apache/impala/analysis/OrcSchemaExtractor.java    |   8 ++++++--
 java/shaded-deps/hive-exec/pom.xml                    |   1 -
 testdata/data/README                                  |   4 ++++
 testdata/data/timestamp_with_local_timezone.orc       | Bin 0 -> 541 bytes
 .../queries/QueryTest/create-table-like-file-orc.test |  12 ++++++++++++
 .../QueryTest/orc_timestamp_with_local_timezone.test  |  17 +++++++++++++++++
 tests/common/file_utils.py                            |  18 ++++++++++++++----
 tests/metadata/test_ddl.py                            |   3 +++
 tests/query_test/test_insert_parquet.py               |  11 +++++++++++
 tests/query_test/test_scanners.py                     |  11 ++++++++++-
 14 files changed, 87 insertions(+), 12 deletions(-)
 create mode 100644 testdata/data/timestamp_with_local_timezone.orc
 create mode 100644 testdata/workloads/functional-query/queries/QueryTest/orc_timestamp_with_local_timezone.test

[impala] 02/02: IMPALA-9967: Add support for reading ORC's TIMESTAMP WITH LOCAL TIMEZONE

Posted by jo...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e26543426c8b7cc86fc0d9f60c53c7a7fd7bc8a8
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Mon Apr 26 17:06:49 2021 +0200

    IMPALA-9967: Add support for reading ORC's TIMESTAMP WITH LOCAL TIMEZONE
    
    ORC-189 and ORC-666 added support for a new timestamp type
    'TIMESTAMP WITH LOCAL TIMEZONE' to the ORC library.
    
    This patch adds support for reading such timestamps with Impala.
    These are UTC-normalized timestamps, therefore we convert them
    to local timezone during scanning.
    
    Testing:
     * added test for CREATE TABLE LIKE ORC
     * added scanner tests to test_scanners.py
    
    Change-Id: Icb0c6a43ebea21f1cba5b8f304db7c4bd43967d9
    Reviewed-on: http://gerrit.cloudera.org:8080/17347
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/orc-column-readers.cc                     |   2 +-
 be/src/exec/orc-column-readers.h                      |   7 ++++++-
 be/src/exec/orc-metadata-utils.cc                     |   1 +
 bin/impala-config.sh                                  |   4 ++--
 .../apache/impala/analysis/OrcSchemaExtractor.java    |   8 ++++++--
 java/shaded-deps/hive-exec/pom.xml                    |   1 -
 testdata/data/README                                  |   4 ++++
 testdata/data/timestamp_with_local_timezone.orc       | Bin 0 -> 541 bytes
 .../queries/QueryTest/create-table-like-file-orc.test |  12 ++++++++++++
 .../QueryTest/orc_timestamp_with_local_timezone.test  |  17 +++++++++++++++++
 tests/common/file_utils.py                            |  18 ++++++++++++++----
 tests/metadata/test_ddl.py                            |   3 +++
 tests/query_test/test_scanners.py                     |  11 ++++++++++-
 13 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 0b22f71..8bfa039 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -225,7 +225,7 @@ Status OrcTimestampReader::ReadValue(int row_idx, Tuple* tuple, MemPool* pool) {
   int64_t secs = batch_->data.data()[row_idx];
   int64_t nanos = batch_->nanoseconds.data()[row_idx];
   auto slot = reinterpret_cast<TimestampValue*>(GetSlot(tuple));
-  *slot = TimestampValue::FromUnixTimeNanos(secs, nanos, UTCPTR);
+  *slot = TimestampValue::FromUnixTimeNanos(secs, nanos, timezone_);
   if (UNLIKELY(!slot->HasDate())) {
     SetNullSlot(tuple);
     TErrorCode::type errorCode = TErrorCode::ORC_TIMESTAMP_OUT_OF_RANGE;
diff --git a/be/src/exec/orc-column-readers.h b/be/src/exec/orc-column-readers.h
index 8b8ce73..30fb0cc 100644
--- a/be/src/exec/orc-column-readers.h
+++ b/be/src/exec/orc-column-readers.h
@@ -370,7 +370,11 @@ class OrcTimestampReader : public OrcPrimitiveColumnReader<OrcTimestampReader> {
  public:
   OrcTimestampReader(const orc::Type* node, const SlotDescriptor* slot_desc,
       HdfsOrcScanner* scanner)
-      : OrcPrimitiveColumnReader<OrcTimestampReader>(node, slot_desc, scanner) { }
+      : OrcPrimitiveColumnReader<OrcTimestampReader>(node, slot_desc, scanner) {
+    if (node->getKind() == orc::TIMESTAMP_INSTANT) {
+      timezone_ = scanner_->state_->local_time_zone();
+    }
+  }
 
   virtual ~OrcTimestampReader() { }
 
@@ -386,6 +390,7 @@ class OrcTimestampReader : public OrcPrimitiveColumnReader<OrcTimestampReader> {
   friend class OrcPrimitiveColumnReader<OrcTimestampReader>;
 
   orc::TimestampVectorBatch* batch_ = nullptr;
+  const Timezone* timezone_ = UTCPTR;
 };
 
 class OrcDateColumnReader : public OrcPrimitiveColumnReader<OrcDateColumnReader> {
diff --git a/be/src/exec/orc-metadata-utils.cc b/be/src/exec/orc-metadata-utils.cc
index b198bae..49ec782 100644
--- a/be/src/exec/orc-metadata-utils.cc
+++ b/be/src/exec/orc-metadata-utils.cc
@@ -219,6 +219,7 @@ Status OrcSchemaResolver::ValidateType(const ColumnType& type,
       }
       break;
     case orc::TypeKind::TIMESTAMP:
+    case orc::TypeKind::TIMESTAMP_INSTANT:
       if (type.type == TYPE_TIMESTAMP) return Status::OK();
       break;
     case orc::TypeKind::DECIMAL: {
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index ff87bd7..89a2793 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -68,7 +68,7 @@ fi
 # moving to a different build of the toolchain, e.g. when a version is bumped or a
 # compile option is changed. The build id can be found in the output of the toolchain
 # build jobs, it is constructed from the build number and toolchain git hash prefix.
-export IMPALA_TOOLCHAIN_BUILD_ID=10-425e7fd862
+export IMPALA_TOOLCHAIN_BUILD_ID=15-4a1bf8d9bc
 # Versions of toolchain dependencies.
 # -----------------------------------
 export IMPALA_AVRO_VERSION=1.7.4-p5
@@ -132,7 +132,7 @@ export IMPALA_OPENLDAP_VERSION=2.4.47
 unset IMPALA_OPENLDAP_URL
 export IMPALA_OPENSSL_VERSION=1.0.2l
 unset IMPALA_OPENSSL_URL
-export IMPALA_ORC_VERSION=1.6.2-p7
+export IMPALA_ORC_VERSION=1.6.2-p11
 unset IMPALA_ORC_URL
 export IMPALA_PROTOBUF_VERSION=3.5.1
 unset IMPALA_PROTOBUF_URL
diff --git a/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java b/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java
index 9515ff4..27d8c16 100644
--- a/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java
+++ b/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java
@@ -73,7 +73,8 @@ public class OrcSchemaExtractor {
    */
   static private Type convertPrimitiveOrcType(TypeDescription type) {
     Category category = type.getCategory();
-    Preconditions.checkState(category.isPrimitive());
+    Preconditions.checkState(category.isPrimitive() ||
+                             category.equals(Category.TIMESTAMP_INSTANT)); // ORC-790
     switch (category) {
       case BINARY: return Type.STRING;
       case BOOLEAN: return Type.BOOLEAN;
@@ -89,6 +90,7 @@ public class OrcSchemaExtractor {
       case SHORT: return Type.SMALLINT;
       case STRING: return Type.STRING;
       case TIMESTAMP: return Type.TIMESTAMP;
+      case TIMESTAMP_INSTANT: return Type.TIMESTAMP;
       case VARCHAR: return ScalarType.createVarcharType(type.getMaxLength());
       default:
         Preconditions.checkState(false,
@@ -168,7 +170,9 @@ public class OrcSchemaExtractor {
    * Converts an ORC type to an Impala Type.
    */
   static private Type convertOrcType(TypeDescription type) throws AnalysisException {
-    if (type.getCategory().isPrimitive()) {
+    Category category = type.getCategory();
+    // TIMESTAMP_INSTANT is wrongly defined as a compound type (ORC-790).
+    if (category.isPrimitive() || category.equals(Category.TIMESTAMP_INSTANT)) {
       return convertPrimitiveOrcType(type);
     } else {
       return convertComplexOrcType(type);
diff --git a/java/shaded-deps/hive-exec/pom.xml b/java/shaded-deps/hive-exec/pom.xml
index 95030c6..8667862 100644
--- a/java/shaded-deps/hive-exec/pom.xml
+++ b/java/shaded-deps/hive-exec/pom.xml
@@ -120,7 +120,6 @@ the same dependencies
                 <include>org/apache/hive/service/rpc/thrift/**</include>
                 <include>org/apache/hive/common/HiveVersionAnnotation.class</include>
                 <include>org/apache/hadoop/hive/ql/ErrorMsg.class</include>
-                <include>org/apache/orc/**</include>
                 <include>com/google/**</include>
                 <!-- IMPALA-10261: Some versions of Hive shade guava, so include
                  the shaded path as well -->
diff --git a/testdata/data/README b/testdata/data/README
index ca27f24..1663af9 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -629,3 +629,7 @@ Started impala with -write_new_parquet_dictionary_encodings=true
   set max_fs_writers=1;
   create table att stored as parquet as
   select * from functional_parquet.alltypestiny;
+
+timestamp_with_local_timezone.orc:
+ORC file that contains column with type 'timestamp with local timezone'.
+Generated by Spark/Iceberg.
diff --git a/testdata/data/timestamp_with_local_timezone.orc b/testdata/data/timestamp_with_local_timezone.orc
new file mode 100644
index 0000000..c64fb35
Binary files /dev/null and b/testdata/data/timestamp_with_local_timezone.orc differ
diff --git a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test
index 85d3f1a..cf9bc90 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test
@@ -129,3 +129,15 @@ describe transactional_complextypes_clone
 ---- TYPES
 STRING, STRING, STRING
 ====
+---- QUERY
+create table timestamp_with_local_timezone_2 like ORC
+'$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/timestamp_with_local_timezone/timestamp_with_local_timezone.orc';
+describe timestamp_with_local_timezone_2
+---- RESULTS
+'id','int','Inferred from ORC file.'
+'user','string','Inferred from ORC file.'
+'action','string','Inferred from ORC file.'
+'event_time','timestamp','Inferred from ORC file.'
+---- TYPES
+STRING, STRING, STRING
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/orc_timestamp_with_local_timezone.test b/testdata/workloads/functional-query/queries/QueryTest/orc_timestamp_with_local_timezone.test
new file mode 100644
index 0000000..438a4db
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/orc_timestamp_with_local_timezone.test
@@ -0,0 +1,17 @@
+====
+---- QUERY
+set timezone=CET;
+SELECT * FROM timestamp_with_local_timezone;
+---- RESULTS
+1,'Alex','view',2020-01-01 09:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
+---- QUERY
+set timezone="US/Hawaii";
+SELECT * FROM timestamp_with_local_timezone;
+---- RESULTS
+1,'Alex','view',2019-12-31 22:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
diff --git a/tests/common/file_utils.py b/tests/common/file_utils.py
index e91befa..11ad839 100644
--- a/tests/common/file_utils.py
+++ b/tests/common/file_utils.py
@@ -29,7 +29,17 @@ from tests.util.filesystem_utils import get_fs_path
 def create_table_from_parquet(impala_client, unique_database, table_name):
   """Utility function to create a database table from a Parquet file. A Parquet file must
   exist in $IMPALA_HOME/testdata/data with the name 'table_name'.parquet"""
-  filename = '{0}.parquet'.format(table_name)
+  create_table_from_file(impala_client, unique_database, table_name, 'parquet')
+
+
+def create_table_from_orc(impala_client, unique_database, table_name):
+  """Utility function to create a database table from an ORC file. An ORC file must
+  exist in $IMPALA_HOME/testdata/data with the name 'table_name'.orc"""
+  create_table_from_file(impala_client, unique_database, table_name, 'orc')
+
+
+def create_table_from_file(impala_client, unique_database, table_name, file_format):
+  filename = '{0}.{1}'.format(table_name, file_format)
   local_file = os.path.join(os.environ['IMPALA_HOME'],
                             'testdata/data/{0}'.format(filename))
   assert os.path.isfile(local_file)
@@ -38,15 +48,15 @@ def create_table_from_parquet(impala_client, unique_database, table_name):
   tbl_dir = get_fs_path('/test-warehouse/{0}.db/{1}'.format(unique_database, table_name))
   check_call(['hdfs', 'dfs', '-mkdir', '-p', tbl_dir])
 
-  # Put the parquet file in the table's directory
+  # Put the file into the table's directory
   # Note: -d skips a staging copy
   check_call(['hdfs', 'dfs', '-put', '-f', '-d', local_file, tbl_dir])
 
   # Create the table
   hdfs_file = '{0}/{1}'.format(tbl_dir, filename)
   qualified_table_name = '{0}.{1}'.format(unique_database, table_name)
-  impala_client.execute('create table {0} like parquet "{1}" stored as parquet'.format(
-    qualified_table_name, hdfs_file))
+  impala_client.execute('create table {0} like {1} "{2}" stored as {1}'.format(
+      qualified_table_name, file_format, hdfs_file))
 
 
 def create_table_and_copy_files(impala_client, create_stmt, unique_database, table_name,
diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py
index 419e36c..6518388 100644
--- a/tests/metadata/test_ddl.py
+++ b/tests/metadata/test_ddl.py
@@ -24,6 +24,7 @@ import time
 from test_ddl_base import TestDdlBase
 from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
 from tests.common.environ import (HIVE_MAJOR_VERSION)
+from tests.common.file_utils import create_table_from_orc
 from tests.common.impala_test_suite import LOG
 from tests.common.parametrize import UniqueDatabase
 from tests.common.skip import (SkipIf, SkipIfABFS, SkipIfADLS, SkipIfKudu, SkipIfLocal,
@@ -306,6 +307,8 @@ class TestDdlStatements(TestDdlBase):
     bucket_file = filter(lambda s: s.startswith('bucket'),
       self.filesystem_client.ls(COMPLEXTYPETBL_PATH + base_dir))[0]
     vector.get_value('exec_option')['abort_on_error'] = False
+    create_table_from_orc(self.client, unique_database,
+        'timestamp_with_local_timezone')
     self.run_test_case('QueryTest/create-table-like-file-orc', vector,
         use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector),
         test_file_vars={
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 44d540c..6b4a83c 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -31,7 +31,6 @@ from parquet.ttypes import ConvertedType
 from subprocess import check_call
 
 from testdata.common import widetable
-from tests.common.file_utils import create_table_and_copy_files
 from tests.common.impala_test_suite import ImpalaTestSuite, LOG
 from tests.common.skip import (
     SkipIf,
@@ -1581,6 +1580,16 @@ class TestOrc(ImpalaTestSuite):
       self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
                          new_vector, unique_database)
 
+  def test_orc_timestamp_with_local_timezone(self, vector, unique_database):
+      """Test scanning of ORC file that contains 'timestamp with local timezone'."""
+      test_files = ["testdata/data/timestamp_with_local_timezone.orc"]
+      create_table_and_copy_files(self.client,
+          "create table {db}.{tbl} "
+          "(id int, user string, action string, event_time timestamp) "
+          "stored as orc", unique_database, "timestamp_with_local_timezone", test_files)
+      self.run_test_case("QueryTest/orc_timestamp_with_local_timezone", vector,
+          unique_database)
+
   def _run_invalid_schema_test(self, unique_database, test_name, expected_error):
     """Copies 'test_name'.orc to a table and runs a simple query. These tests should
        cause an error during the processing of the ORC schema, so the file's columns do

[impala] 01/02: IMPALA-10683: Skip test 'test_double_precision' for non-HDFS test env

Posted by jo...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 9253e0a9e7751a471ddfde754767ca8f56b14ec4
Author: Amogh Margoor <am...@cloudera.com>
AuthorDate: Tue May 4 02:48:26 2021 -0700

    IMPALA-10683: Skip test 'test_double_precision' for non-HDFS test env
    
    TestHdfsParquetTableWriter.test_double_precision uses a Hive statement.
    IMPALA-9365 describes why HS2 is not started on non-HDFS test envs,
    and therefore we need to skip this test for such test environments.
    A Hive statement is used because Impala's results are converted
    to strings by Python. With both HS2 and beeswax, this only preserves
    float precision up to 16 decimal digits, and the test needs 17.
    
    Change-Id: I1a3225ba563ec4d0514b489c3a1daf8291ca1445
    Reviewed-on: http://gerrit.cloudera.org:8080/17397
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/query_test/test_insert_parquet.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/query_test/test_insert_parquet.py b/tests/query_test/test_insert_parquet.py
index 5faf31b..1a050e6 100644
--- a/tests/query_test/test_insert_parquet.py
+++ b/tests/query_test/test_insert_parquet.py
@@ -533,6 +533,17 @@ class TestHdfsParquetTableWriter(ImpalaTestSuite):
     self._ctas_and_check_int64_timestamps(vector, unique_database, tmpdir, "micros")
     self._ctas_and_check_int64_timestamps(vector, unique_database, tmpdir, "nanos")
 
+  # Skip test for non-HDFS environments as it uses a Hive statement.
+  # A Hive statement is used because Impala's results are converted
+  # to strings by Python. With both HS2 and beeswax, this only preserves
+  # float precision up to 16 decimal digits, and the test needs 17.
+  # IMPALA-9365 describes why HS2 is not started on non-HDFS test envs.
+  @SkipIfS3.hive
+  @SkipIfGCS.hive
+  @SkipIfABFS.hive
+  @SkipIfADLS.hive
+  @SkipIfIsilon.hive
+  @SkipIfLocal.hive
   def test_double_precision(self, vector, unique_database):
     # IMPALA-10654: Test inserting double into Parquet table retains the precision.
     src_tbl = "{0}.{1}".format(unique_database, "i10654_parquet")