You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/07/06 23:50:49 UTC

[impala] branch master updated: IMPALA-9902: add rewrite of TPC-DS q38

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 24f24b1  IMPALA-9902: add rewrite of TPC-DS q38
24f24b1 is described below

commit 24f24b131f0c40efe2aaff3ebcff9fe1cdeb6c88
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Fri Jun 26 17:23:44 2020 -0700

    IMPALA-9902: add rewrite of TPC-DS q38
    
    I generated the query with dsqgen and then
    rewrote it to avoid intersect.
    
    Testing:
    Compared results to hive running the original version of the
    query.
    
    Change-Id: I81807683aa265a946729e15156bd2e33724103e1
    Reviewed-on: http://gerrit.cloudera.org:8080/16118
    Reviewed-by: Tim Armstrong <ta...@cloudera.com>
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../queries/tpcds-decimal_v2-q38-rewrite.test      | 33 ++++++++++++++++++++++
 testdata/workloads/tpcds/queries/tpcds-q8.test     |  4 ++-
 tests/query_test/test_tpcds_queries.py             |  3 ++
 tests/util/parse_util.py                           |  2 +-
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q38-rewrite.test b/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q38-rewrite.test
new file mode 100644
index 0000000..6c0878a
--- /dev/null
+++ b/testdata/workloads/tpcds/queries/tpcds-decimal_v2-q38-rewrite.test
@@ -0,0 +1,33 @@
+====
+---- QUERY: TPCDS-Q38-REWRITE
+-- This is an unofficial variant that is rewritten to use EXISTS subqueries instead of
+-- INTERSECT
+select count(*) from (
+    select distinct c_last_name, c_first_name, d_date
+    from store_sales, date_dim dd1, customer c1
+          where store_sales.ss_sold_date_sk = dd1.d_date_sk
+      and store_sales.ss_customer_sk = c1.c_customer_sk
+      and d_month_seq between 1189 and 1189 + 11
+      and exists (
+        select distinct c_last_name, c_first_name, d_date
+        from catalog_sales, date_dim dd2, customer c2
+              where catalog_sales.cs_sold_date_sk = dd2.d_date_sk
+          and catalog_sales.cs_bill_customer_sk = c2.c_customer_sk
+          and d_month_seq between 1189 and 1189 + 11
+          and c1.c_last_name <=> c2.c_last_name and c1.c_first_name <=> c2.c_first_name and dd1.d_date <=> dd2.d_date
+          )
+     and exists (
+        select distinct c_last_name, c_first_name, d_date
+        from web_sales, date_dim dd3, customer c3
+              where web_sales.ws_sold_date_sk = dd3.d_date_sk
+          and web_sales.ws_bill_customer_sk = c3.c_customer_sk
+          and d_month_seq between 1189 and 1189 + 11
+          and c1.c_last_name <=> c3.c_last_name and c1.c_first_name <=> c3.c_first_name and dd1.d_date <=> dd3.d_date
+     )
+) hot_cust
+limit 100
+---- RESULTS
+108
+---- TYPES
+BIGINT
+====
diff --git a/testdata/workloads/tpcds/queries/tpcds-q8.test b/testdata/workloads/tpcds/queries/tpcds-q8.test
index 91e41d2..ff4bcf5 100644
--- a/testdata/workloads/tpcds/queries/tpcds-q8.test
+++ b/testdata/workloads/tpcds/queries/tpcds-q8.test
@@ -1,5 +1,7 @@
 ====
 ---- QUERY: TPCDS-Q8
+-- This is an unofficial variant that is rewritten to use SEMI JOIN instead of
+-- INTERSECT.
 select
   s_store_name,
   sum(ss_net_profit)
@@ -71,4 +73,4 @@ limit 100
 ---- RESULTS
 ---- TYPES
 STRING, DECIMAL
-====
\ No newline at end of file
+====
diff --git a/tests/query_test/test_tpcds_queries.py b/tests/query_test/test_tpcds_queries.py
index b8a8504..18135fc 100644
--- a/tests/query_test/test_tpcds_queries.py
+++ b/tests/query_test/test_tpcds_queries.py
@@ -426,6 +426,9 @@ class TestTpcdsDecimalV2Query(ImpalaTestSuite):
   def test_tpcds_q37(self, vector):
     self.run_test_case(self.get_workload() + '-decimal_v2-q37', vector)
 
+  def test_tpcds_q38(self, vector):
+    self.run_test_case(self.get_workload() + '-decimal_v2-q38-rewrite', vector)
+
   def test_tpcds_q39_1(self, vector):
     self.run_test_case(self.get_workload() + '-decimal_v2-q39-1', vector)
 
diff --git a/tests/util/parse_util.py b/tests/util/parse_util.py
index 448b6ec..c093a5d 100644
--- a/tests/util/parse_util.py
+++ b/tests/util/parse_util.py
@@ -22,7 +22,7 @@ from datetime import datetime
 # changed, and the stress test loses the ability to run the full set of queries. Set
 # these constants and assert that when a workload is used, all the queries we expect to
 # use are there.
-EXPECTED_TPCDS_QUERIES_COUNT = 85
+EXPECTED_TPCDS_QUERIES_COUNT = 86
 EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22
 EXPECTED_TPCH_QUERIES_COUNT = 22
 # Add the number of stress test specific queries, i.e. in files like '*-stress-*.test'