You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/07/06 23:50:49 UTC
[impala] branch master updated: IMPALA-9902: add rewrite of TPC-DS
q38
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 24f24b1 IMPALA-9902: add rewrite of TPC-DS q38
24f24b1 is described below
commit 24f24b131f0c40efe2aaff3ebcff9fe1cdeb6c88
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Fri Jun 26 17:23:44 2020 -0700
IMPALA-9902: add rewrite of TPC-DS q38
I generated the query with dsqgen and then
rewrote it to avoid intersect.
Testing:
Compared results to hive running the original version of the
query.
Change-Id: I81807683aa265a946729e15156bd2e33724103e1
Reviewed-on: http://gerrit.cloudera.org:8080/16118
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../queries/tpcds-decimal_v2-q38-rewrite.test | 33 ++++++++++++++++++++++
testdata/workloads/tpcds/queries/tpcds-q8.test | 4 ++-
tests/query_test/test_tpcds_queries.py | 3 ++
tests/util/parse_util.py | 2 +-
4 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q38-rewrite.test b/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q38-rewrite.test
new file mode 100644
index 0000000..6c0878a
--- /dev/null
+++ b/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q38-rewrite.test
@@ -0,0 +1,33 @@
+====
+---- QUERY: TPCDS-Q38-REWRITE
+-- This is an unofficial variant that is rewritten to use EXISTS subqueries instead of
+-- INTERSECT
+select count(*) from (
+ select distinct c_last_name, c_first_name, d_date
+ from store_sales, date_dim dd1, customer c1
+ where store_sales.ss_sold_date_sk = dd1.d_date_sk
+ and store_sales.ss_customer_sk = c1.c_customer_sk
+ and d_month_seq between 1189 and 1189 + 11
+ and exists (
+ select distinct c_last_name, c_first_name, d_date
+ from catalog_sales, date_dim dd2, customer c2
+ where catalog_sales.cs_sold_date_sk = dd2.d_date_sk
+ and catalog_sales.cs_bill_customer_sk = c2.c_customer_sk
+ and d_month_seq between 1189 and 1189 + 11
+ and c1.c_last_name <=> c2.c_last_name and c1.c_first_name <=> c2.c_first_name and dd1.d_date <=> dd2.d_date
+ )
+ and exists (
+ select distinct c_last_name, c_first_name, d_date
+ from web_sales, date_dim dd3, customer c3
+ where web_sales.ws_sold_date_sk = dd3.d_date_sk
+ and web_sales.ws_bill_customer_sk = c3.c_customer_sk
+ and d_month_seq between 1189 and 1189 + 11
+ and c1.c_last_name <=> c3.c_last_name and c1.c_first_name <=> c3.c_first_name and dd1.d_date <=> dd3.d_date
+ )
+) hot_cust
+limit 100
+---- RESULTS
+108
+---- TYPES
+BIGINT
+====
diff --git a/testdata/workloads/tpcds/queries/tpcds-q8.test b/testdata/workloads/tpcds/queries/tpcds-q8.test
index 91e41d2..ff4bcf5 100644
--- a/testdata/workloads/tpcds/queries/tpcds-q8.test
+++ b/testdata/workloads/tpcds/queries/tpcds-q8.test
@@ -1,5 +1,7 @@
====
---- QUERY: TPCDS-Q8
+-- This is an unofficial variant that is rewritten to use SEMI JOIN instead of
+-- INTERSECT.
select
s_store_name,
sum(ss_net_profit)
@@ -71,4 +73,4 @@ limit 100
---- RESULTS
---- TYPES
STRING, DECIMAL
-====
\ No newline at end of file
+====
diff --git a/tests/query_test/test_tpcds_queries.py b/tests/query_test/test_tpcds_queries.py
index b8a8504..18135fc 100644
--- a/tests/query_test/test_tpcds_queries.py
+++ b/tests/query_test/test_tpcds_queries.py
@@ -426,6 +426,9 @@ class TestTpcdsDecimalV2Query(ImpalaTestSuite):
def test_tpcds_q37(self, vector):
self.run_test_case(self.get_workload() + '-decimal_v2-q37', vector)
+ def test_tpcds_q38(self, vector):
+ self.run_test_case(self.get_workload() + '-decimal_v2-q38-rewrite', vector)
+
def test_tpcds_q39_1(self, vector):
self.run_test_case(self.get_workload() + '-decimal_v2-q39-1', vector)
diff --git a/tests/util/parse_util.py b/tests/util/parse_util.py
index 448b6ec..c093a5d 100644
--- a/tests/util/parse_util.py
+++ b/tests/util/parse_util.py
@@ -22,7 +22,7 @@ from datetime import datetime
# changed, and the stress test loses the ability to run the full set of queries. Set
# these constants and assert that when a workload is used, all the queries we expect to
# use are there.
-EXPECTED_TPCDS_QUERIES_COUNT = 85
+EXPECTED_TPCDS_QUERIES_COUNT = 86
EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22
EXPECTED_TPCH_QUERIES_COUNT = 22
# Add the number of stress test specific queries, i.e. in files like '*-stress-*.test'