You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2022/02/23 22:58:02 UTC
[impala] 01/02: IMPALA-11137: Enable proleptic Gregorian Calendar for Hive
This is an automated email from the ASF dual-hosted git repository.
csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 331ff4647db9611dc64365be6e2a8a18c4307815
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Sun Feb 20 16:12:39 2022 +0800
IMPALA-11137: Enable proleptic Gregorian Calendar for Hive
Since HIVE-22589, Hive still uses Julian Calendar for writing dates
before 1582-10-15, whereas Impala uses proleptic Gregorian Calendar.
This affects the results Impala gets when querying tables written by
Hive. Currently, the Avro and ORC formats of date_tbl are suffering this
issue.
This patch enables proleptic Gregorian Calendar for Hive by default.
It also reverts the two commits of IMPALA-9555 which modifies the tests
to satisfy the inconsistent results.
Tests:
- Ran CORE tests
Change-Id: I6be9c9720dd352d6821cdaa6c64d35ba20473bc0
Reviewed-on: http://gerrit.cloudera.org:8080/18262
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
fe/src/test/resources/hive-site.xml.py | 14 +++++-
.../queries/QueryTest/avro_date.test | 51 +++++++++-------------
.../queries/QueryTest/orc-stats.test | 6 +--
tests/query_test/test_date_queries.py | 3 +-
4 files changed, 36 insertions(+), 38 deletions(-)
diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index b67a162..80c758b 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -109,7 +109,7 @@ if kerberize:
# hive.metastore.kerberos.keytab.file
# hive.metastore.kerberos.principal
-# Enable Tez and ACID for Hive 3
+# Enable Tez, ACID and proleptic Gregorian calendar DATE types for Hive 3
if hive_major_version >= 3:
CONFIG.update({
'hive.tez.container.size': '512',
@@ -155,7 +155,17 @@ if hive_major_version >= 3:
# Due to HIVE-23102 Hive will wait for at least this amount of time for compactions
# (the default value is 5 mins which is way too long). Setting it to 2 seconds.
- 'hive.compactor.wait.timeout': '2000'
+ 'hive.compactor.wait.timeout': '2000',
+
+ # Since HIVE-22589, Hive uses Julian Calendar for writing dates before 1582-10-15,
+ # whereas Impala uses proleptic Gregorian Calendar. This affects the results Impala
+ # gets when querying tables written by Hive.
+ 'hive.avro.proleptic.gregorian': 'true',
+ 'hive.avro.proleptic.gregorian.default': 'true',
+ 'hive.parquet.date.proleptic.gregorian': 'true',
+ 'hive.parquet.date.proleptic.gregorian.default': 'true',
+ 'orc.proleptic.gregorian': 'true',
+ 'orc.proleptic.gregorian.default': 'true'
})
else:
CONFIG.update({
diff --git a/testdata/workloads/functional-query/queries/QueryTest/avro_date.test b/testdata/workloads/functional-query/queries/QueryTest/avro_date.test
index 8ebf6bf..317e58c 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/avro_date.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/avro_date.test
@@ -33,30 +33,32 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
---- HIVE_MAJOR_VERSION
3
---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
select id_col, date_part, date_col from date_tbl;
---- RESULTS
+0,0001-01-01,0001-01-01
+1,0001-01-01,0001-12-31
+2,0001-01-01,0002-01-01
+3,0001-01-01,1399-12-31
+4,0001-01-01,2017-11-28
+5,0001-01-01,9999-12-31
+6,0001-01-01,NULL
10,1399-06-27,2017-11-28
11,1399-06-27,NULL
12,1399-06-27,2018-12-31
-20,2017-11-27,0001-06-19
-21,2017-11-27,0001-06-20
-22,2017-11-27,0001-06-21
-23,2017-11-27,0001-06-22
-24,2017-11-27,0001-06-23
-25,2017-11-27,0001-06-24
-26,2017-11-27,0001-06-25
-27,2017-11-27,0001-06-26
-28,2017-11-27,0001-06-27
+20,2017-11-27,0001-06-21
+21,2017-11-27,0001-06-22
+22,2017-11-27,0001-06-23
+23,2017-11-27,0001-06-24
+24,2017-11-27,0001-06-25
+25,2017-11-27,0001-06-26
+26,2017-11-27,0001-06-27
+27,2017-11-27,0001-06-28
+28,2017-11-27,0001-06-29
29,2017-11-27,2017-11-28
30,9999-12-31,9999-12-01
31,9999-12-31,9999-12-31
---- TYPES
INT,DATE,DATE
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
====
---- HIVE_MAJOR_VERSION
2
@@ -78,18 +80,14 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
---- HIVE_MAJOR_VERSION
3
---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
select date_part, count(date_col) from date_tbl group by date_part;
---- RESULTS
2017-11-27,10
1399-06-27,2
9999-12-31,2
+0001-01-01,6
---- TYPES
DATE, BIGINT
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
====
---- HIVE_MAJOR_VERSION
2
@@ -109,16 +107,11 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
---- HIVE_MAJOR_VERSION
3
---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
select min(date_part), max(date_part), min(date_col), max(date_col) from date_tbl;
---- RESULTS
-1399-06-27,9999-12-31,0001-06-19,9999-12-31
+0001-01-01,9999-12-31,0001-01-01,9999-12-31
---- TYPES
DATE, DATE, DATE, DATE
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
====
---- HIVE_MAJOR_VERSION
2
@@ -140,18 +133,14 @@ Avro file '__HDFS_FILENAME__' is corrupt: out of range date value -719164 at off
---- HIVE_MAJOR_VERSION
3
---- QUERY
-# Avro table was created by Hive3. Since HIVE-22589 Hive3 also uses Julian calendar for
-# dates before 1582-10-15 by default, therefore we expect the same results as above.
select date_part, min(date_col), max(date_col) from date_tbl group by date_part;
---- RESULTS
-2017-11-27,0001-06-19,2017-11-28
+2017-11-27,0001-06-21,2017-11-28
1399-06-27,2017-11-28,2018-12-31
9999-12-31,9999-12-01,9999-12-31
+0001-01-01,0001-01-01,9999-12-31
---- TYPES
DATE, DATE, DATE
----- ERRORS
-row_regex: .*Problem parsing file __HDFS_FILENAME__ at \d+
-row_regex: .*Avro file '__HDFS_FILENAME__' is corrupt: out of range date value \-?\d+ at offset \d+\. The valid date range is \-719162\.\.2932896 \(0001\-01\-01\.\.9999\-12\-31\)\.
====
---- QUERY
select date_part, count(*) from date_tbl group by date_part;
diff --git a/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test b/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test
index c878a29..fbe24d4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/orc-stats.test
@@ -518,7 +518,7 @@ aggregation(SUM, RowsRead): 310
====
---- QUERY
select count(*) from functional_orc_def.date_tbl
-where date_part in ("2017-11-27", "1399-06-27") and date_col < '0001-06-19';
+where date_part in ("2017-11-27", "1399-06-27") and date_col < '0001-06-21';
---- RESULTS
0
---- RUNTIME_PROFILE
@@ -526,7 +526,7 @@ aggregation(SUM, RowsRead): 0
====
---- QUERY
select count(*) from functional_orc_def.date_tbl
-where date_part in ("2017-11-27", "1399-06-27") and date_col <= '0001-06-19';
+where date_part in ("2017-11-27", "1399-06-27") and date_col <= '0001-06-21';
---- RESULTS
1
---- RUNTIME_PROFILE
@@ -725,7 +725,7 @@ aggregation(SUM, RowsRead): 0
====
---- QUERY
select count(*) from functional_orc_def.date_tbl
-where date_part in ("2017-11-27", "1399-06-27") and date_col in ('0001-06-19', '2018-12-31');
+where date_part in ("2017-11-27", "1399-06-27") and date_col in ('0001-06-21', '2018-12-31');
---- RESULTS
2
---- RUNTIME_PROFILE
diff --git a/tests/query_test/test_date_queries.py b/tests/query_test/test_date_queries.py
index f6e5d6e..a2cc112 100644
--- a/tests/query_test/test_date_queries.py
+++ b/tests/query_test/test_date_queries.py
@@ -57,8 +57,7 @@ class TestDateQueries(ImpalaTestSuite):
# - Hive2 uses Julian Calendar for writing dates before 1582-10-15, whereas Impala
# uses proleptic Gregorian Calendar. This affects the results Impala gets when
# querying avro tables written by Hive2.
- # - Since HIVE-22589, Hive3 also uses Julian Calendar for dates before 1582-10-15
- # by default.
+ # - Hive3 on the other hand uses proleptic Gregorian Calendar to write dates.
self.run_test_case('QueryTest/avro_date', vector)
else:
self.run_test_case('QueryTest/date', vector)