You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2022/02/23 22:58:02 UTC

[impala] 01/02: IMPALA-11137: Enable proleptic Gregorian Calendar for Hive

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 331ff4647db9611dc64365be6e2a8a18c4307815
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Sun Feb 20 16:12:39 2022 +0800

    IMPALA-11137: Enable proleptic Gregorian Calendar for Hive
    
    Since HIVE-22589, Hive still uses Julian Calendar for writing dates
    before 1582-10-15, whereas Impala uses proleptic Gregorian Calendar.
    This affects the results Impala gets when querying tables written by
    Hive. Currently, the Avro and ORC formats of date_tbl are suffering this
    issue.
    
    This patch enables proleptic Gregorian Calendar for Hive by default.
    It also reverts the two commits of IMPALA-9555 which modifies the tests
    to satisfy the inconsistent results.
    
    Tests:
     - Ran CORE tests
    
    Change-Id: I6be9c9720dd352d6821cdaa6c64d35ba20473bc0
    Reviewed-on: http://gerrit.cloudera.org:8080/18262
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 fe/src/test/resources/hive-site.xml.py             | 14 +++++-
 .../queries/QueryTest/avro_date.test               | 51 +++++++++-------------
 .../queries/QueryTest/orc-stats.test               |  6 +--
 tests/query_test/test_date_queries.py              |  3 +-
 4 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index b67a162..80c758b 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -109,7 +109,7 @@ if kerberize:
   #   hive.metastore.kerberos.keytab.file
   #   hive.metastore.kerberos.principal
 
-# Enable Tez and ACID for Hive 3
+# Enable Tez, ACID and proleptic Gregorian calendar DATE types for Hive 3
 if hive_major_version >= 3:
   CONFIG.update({
    'hive.tez.container.size': '512',
@@ -155,7 +155,17 @@ if hive_major_version >= 3:
 
    # Due to HIVE-23102 Hive will wait for at least this amount of time for compactions
    # (the default value is 5 mins which is way too long). Setting it to 2 seconds.
-   'hive.compactor.wait.timeout': '2000'
+   'hive.compactor.wait.timeout': '2000',
+
+   # Since HIVE-22589, Hive uses Julian Calendar for writing dates before 1582-10-15,
+   # whereas Impala uses proleptic Gregorian Calendar. This affects the results Impala
+   # gets when querying tables written by Hive.
+   'hive.avro.proleptic.gregorian': 'true',
+   'hive.avro.proleptic.gregorian.default': 'true',
+   'hive.parquet.date.proleptic.gregorian': 'true',
+   'hive.parquet.date.proleptic.gregorian.default': 'true',
+   'orc.proleptic.gregorian': 'true',
+   'orc.proleptic.gregorian.default': 'true'
   })
 else:
   CONFIG.update({
diff --git a/testdata/workloads/functional-query/queries/QueryTest/avro_date.test b/testdata/workloads/functional-query/queries/QueryTest/avro_date.test
index 8ebf6bf..317e58c 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/avro_date.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/avro_date.test
@@ -33,30 +33,32 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
 ---- HIVE_MAJOR_VERSION
 3
 ---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
 select id_col, date_part, date_col from date_tbl;
 ---- RESULTS
+0,0001-01-01,0001-01-01
+1,0001-01-01,0001-12-31
+2,0001-01-01,0002-01-01
+3,0001-01-01,1399-12-31
+4,0001-01-01,2017-11-28
+5,0001-01-01,9999-12-31
+6,0001-01-01,NULL
 10,1399-06-27,2017-11-28
 11,1399-06-27,NULL
 12,1399-06-27,2018-12-31
-20,2017-11-27,0001-06-19
-21,2017-11-27,0001-06-20
-22,2017-11-27,0001-06-21
-23,2017-11-27,0001-06-22
-24,2017-11-27,0001-06-23
-25,2017-11-27,0001-06-24
-26,2017-11-27,0001-06-25
-27,2017-11-27,0001-06-26
-28,2017-11-27,0001-06-27
+20,2017-11-27,0001-06-21
+21,2017-11-27,0001-06-22
+22,2017-11-27,0001-06-23
+23,2017-11-27,0001-06-24
+24,2017-11-27,0001-06-25
+25,2017-11-27,0001-06-26
+26,2017-11-27,0001-06-27
+27,2017-11-27,0001-06-28
+28,2017-11-27,0001-06-29
 29,2017-11-27,2017-11-28
 30,9999-12-31,9999-12-01
 31,9999-12-31,9999-12-31
 ---- TYPES
 INT,DATE,DATE
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
 ====
 ---- HIVE_MAJOR_VERSION
 2
@@ -78,18 +80,14 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
 ---- HIVE_MAJOR_VERSION
 3
 ---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
 select date_part, count(date_col) from date_tbl group by date_part;
 ---- RESULTS
 2017-11-27,10
 1399-06-27,2
 9999-12-31,2
+0001-01-01,6
 ---- TYPES
 DATE, BIGINT
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
 ====
 ---- HIVE_MAJOR_VERSION
 2
@@ -109,16 +107,11 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
 ---- HIVE_MAJOR_VERSION
 3
 ---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
 select min(date_part), max(date_part), min(date_col), max(date_col) from date_tbl;
 ---- RESULTS
-1399-06-27,9999-12-31,0001-06-19,9999-12-31
+0001-01-01,9999-12-31,0001-01-01,9999-12-31
 ---- TYPES
 DATE, DATE, DATE, DATE
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
 ====
 ---- HIVE_MAJOR_VERSION
 2
@@ -140,18 +133,14 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
 ---- HIVE_MAJOR_VERSION
 3
 ---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
 select date_part, min(date_col), max(date_col) from date_tbl group by date_part;
 ---- RESULTS
-2017-11-27,0001-06-19,2017-11-28
+2017-11-27,0001-06-21,2017-11-28
 1399-06-27,2017-11-28,2018-12-31
 9999-12-31,9999-12-01,9999-12-31
+0001-01-01,0001-01-01,9999-12-31
 ---- TYPES
 DATE, DATE, DATE
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
 ====
 ---- QUERY
 select date_part, count(*) from date_tbl group by date_part;
diff --git a/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test b/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test
index c878a29..fbe24d4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test
@@ -518,7 +518,7 @@ aggregation(SUM, RowsRead): 310
 ====
 ---- QUERY
 select count(*) from functional_orc_def.date_tbl
-where date_part in ("2017-11-27", "1399-06-27") and date_col < '0001-06-19';
+where date_part in ("2017-11-27", "1399-06-27") and date_col < '0001-06-21';
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
@@ -526,7 +526,7 @@ aggregation(SUM, RowsRead): 0
 ====
 ---- QUERY
 select count(*) from functional_orc_def.date_tbl
-where date_part in ("2017-11-27", "1399-06-27") and date_col <= '0001-06-19';
+where date_part in ("2017-11-27", "1399-06-27") and date_col <= '0001-06-21';
 ---- RESULTS
 1
 ---- RUNTIME_PROFILE
@@ -725,7 +725,7 @@ aggregation(SUM, RowsRead): 0
 ====
 ---- QUERY
 select count(*) from functional_orc_def.date_tbl
-where date_part in ("2017-11-27", "1399-06-27") and date_col in ('0001-06-19', '2018-12-31');
+where date_part in ("2017-11-27", "1399-06-27") and date_col in ('0001-06-21', '2018-12-31');
 ---- RESULTS
 2
 ---- RUNTIME_PROFILE
diff --git a/tests/query_test/test_date_queries.py b/tests/query_test/test_date_queries.py
index f6e5d6e..a2cc112 100644
--- a/tests/query_test/test_date_queries.py
+++ b/tests/query_test/test_date_queries.py
@@ -57,8 +57,7 @@ class TestDateQueries(ImpalaTestSuite):
       #  - Hive2 uses Julian Calendar for writing dates before 1582-10-15, whereas Impala
       #    uses proleptic Gregorian Calendar. This affects the results Impala gets when
       #    querying avro tables written by Hive2.
-      #  - Since HIVE-22589, Hive3 also uses Julian Calendar for dates before 1582-10-15
-      #    by default.
+      #  - Hive3 on the other hand uses proleptic Gregorian Calendar to write dates.
       self.run_test_case('QueryTest/avro_date', vector)
     else:
       self.run_test_case('QueryTest/date', vector)