You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by tm...@apache.org on 2018/04/27 22:18:30 UTC

impala git commit: IMPALA-6821: Push down limits into Kudu

Repository: impala
Updated Branches:
  refs/heads/master 1e79f1479 -> 87be63e32


IMPALA-6821: Push down limits into Kudu

This patch takes advantage of a recent change in Kudu (KUDU-16) that
exposes the ability to set limits on KuduScanners. Each KuduScanner
corresponds to a single scan token, and there will be multiple scan
tokens per query, so the limit is enforced per scanner rather than
per query. This is therefore just a performance optimization for
cases where the limit is smaller than the number of rows per token;
Impala still needs to apply the limit itself for cases where the
limit is greater than the number of rows per token. The limit is only
pushed down when the scanner has no conjuncts left to evaluate
(conjunct_evals_ is empty).

Testing:
- Added e2e tests for various situations where limits are applied at
  a Kudu scan node.
- For the query 'select * from tpch_kudu.lineitem limit 1', a
  best-case performance scenario for this change where the limit is
  highly effective, the time spent in the Kudu scan node was reduced
  from 6.107ms to 3.498ms (avg over 3 runs).
- For the query 'select count(*) from (select * from
  tpch_kudu.lineitem limit 1000000) v', a worst-case performance
  scenario for this change where the limit is ineffective, the time
  spent in the Kudu scan node was essentially unchanged: 32.815ms
  previously vs. 29.532ms with this patch (avg over 3 runs).

Change-Id: Ibe35e70065d8706b575e24fe20902cd405b49941
Reviewed-on: http://gerrit.cloudera.org:8080/10119
Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/87be63e3
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/87be63e3
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/87be63e3

Branch: refs/heads/master
Commit: 87be63e321f688486b98d4ea69200967a8a2effa
Parents: 1e79f14
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Fri Apr 13 20:38:16 2018 +0000
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Apr 27 21:55:11 2018 +0000

----------------------------------------------------------------------
 be/src/exec/kudu-scanner.cc                     |   5 +
 .../queries/QueryTest/kudu_limit.test           | 112 +++++++++++++++++++
 tests/query_test/test_kudu.py                   |   3 +
 3 files changed, 120 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/87be63e3/be/src/exec/kudu-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.cc b/be/src/exec/kudu-scanner.cc
index 3bc4441..9e90bdb 100644
--- a/be/src/exec/kudu-scanner.cc
+++ b/be/src/exec/kudu-scanner.cc
@@ -226,6 +226,11 @@ Status KuduScanner::OpenNextScanToken(const string& scan_token, bool* eos) {
     }
   }
 
+  if (scan_node_->limit() != -1 && conjunct_evals_.empty()) {
+    KUDU_RETURN_IF_ERROR(scanner_->SetLimit(scan_node_->limit()),
+        "Failed to set limit on scan.");
+  }
+
   {
     SCOPED_TIMER(state_->total_storage_wait_timer());
     KUDU_RETURN_IF_ERROR(scanner_->Open(), "Unable to open scanner");

http://git-wip-us.apache.org/repos/asf/impala/blob/87be63e3/testdata/workloads/functional-query/queries/QueryTest/kudu_limit.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/kudu_limit.test b/testdata/workloads/functional-query/queries/QueryTest/kudu_limit.test
new file mode 100644
index 0000000..c72afc9
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/kudu_limit.test
@@ -0,0 +1,112 @@
+====
+---- QUERY
+# limit 0
+select * from functional_kudu.alltypes limit 0
+---- RESULTS
+---- TYPES
+int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp,int,int
+====
+---- QUERY
+# no predicate, nondeterministic (no order by) so only check number of rows returned
+select count(*) from (select * from functional_kudu.alltypes limit 2) v
+---- RESULTS
+2
+---- TYPES
+bigint
+====
+---- QUERY
+# no predicate, deterministic (limit doesn't exclude any rows) so check actual values
+select * from functional_kudu.alltypestiny limit 100
+---- RESULTS : VERIFY_IS_EQUAL_SORTED
+0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00,2009,1
+2,true,0,0,0,0,0,0,'02/01/09','0',2009-02-01 00:00:00,2009,2
+4,true,0,0,0,0,0,0,'03/01/09','0',2009-03-01 00:00:00,2009,3
+1,false,1,1,1,10,1.100000023841858,10.1,'01/01/09','1',2009-01-01 00:01:00,2009,1
+5,false,1,1,1,10,1.100000023841858,10.1,'03/01/09','1',2009-03-01 00:01:00,2009,3
+6,true,0,0,0,0,0,0,'04/01/09','0',2009-04-01 00:00:00,2009,4
+7,false,1,1,1,10,1.100000023841858,10.1,'04/01/09','1',2009-04-01 00:01:00,2009,4
+3,false,1,1,1,10,1.100000023841858,10.1,'02/01/09','1',2009-02-01 00:01:00,2009,2
+---- TYPES
+int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp,int,int
+====
+---- QUERY
+# Kudu predicate on PK col, nondeterministic
+select count(id) from (select * from functional_kudu.alltypes where id > 0 limit 3) v
+---- RESULTS
+3
+---- TYPES
+bigint
+====
+---- QUERY
+# Kudu predicate on PK col, deterministic
+select * from functional_kudu.alltypestiny where id > 4 limit 3
+---- RESULTS : VERIFY_IS_EQUAL_SORTED
+5,false,1,1,1,10,1.100000023841858,10.1,'03/01/09','1',2009-03-01 00:01:00,2009,3
+6,true,0,0,0,0,0,0,'04/01/09','0',2009-04-01 00:00:00,2009,4
+7,false,1,1,1,10,1.100000023841858,10.1,'04/01/09','1',2009-04-01 00:01:00,2009,4
+---- TYPES
+int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp,int,int
+====
+---- QUERY
+# Kudu predicate on non-PK col, nondeterministic
+select count(tinyint_col) from (select * from functional_kudu.alltypes where tinyint_col = 6 limit 4) v
+---- RESULTS
+4
+---- TYPES
+bigint
+====
+---- QUERY
+# Kudu predicate on non-PK col, deterministic
+select * from functional_kudu.alltypestiny where tinyint_col = 1 limit 4
+---- RESULTS : VERIFY_IS_EQUAL_SORTED
+1,false,1,1,1,10,1.100000023841858,10.1,'01/01/09','1',2009-01-01 00:01:00,2009,1
+5,false,1,1,1,10,1.100000023841858,10.1,'03/01/09','1',2009-03-01 00:01:00,2009,3
+7,false,1,1,1,10,1.100000023841858,10.1,'04/01/09','1',2009-04-01 00:01:00,2009,4
+3,false,1,1,1,10,1.100000023841858,10.1,'02/01/09','1',2009-02-01 00:01:00,2009,2
+---- TYPES
+int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp,int,int
+====
+---- QUERY
+# Impala predicate, nondeterministic
+select count(string_col) from (select * from functional_kudu.alltypes where id % 2 = 0 limit 2) v
+---- RESULTS
+2
+---- TYPES
+bigint
+====
+---- QUERY
+# Impala predicate, deterministic
+select * from functional_kudu.alltypestiny where id % 2 = 0 limit 100
+---- RESULTS : VERIFY_IS_EQUAL_SORTED
+0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00,2009,1
+2,true,0,0,0,0,0,0,'02/01/09','0',2009-02-01 00:00:00,2009,2
+4,true,0,0,0,0,0,0,'03/01/09','0',2009-03-01 00:00:00,2009,3
+6,true,0,0,0,0,0,0,'04/01/09','0',2009-04-01 00:00:00,2009,4
+---- TYPES
+int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp,int,int
+====
+---- QUERY
+# Both Impala and Kudu predicates, nondeterministic
+select count(month) from
+  (select * from functional_kudu.alltypes where id % 2 = 0 and id > 4 limit 5) v
+---- RESULTS
+5
+---- TYPES
+bigint
+====
+---- QUERY
+# Both Impala and Kudu predicates, deterministic
+select * from functional_kudu.alltypestiny where id % 2 = 0 and id > 4 limit 5
+---- RESULTS
+6,true,0,0,0,0,0,0,'04/01/09','0',2009-04-01 00:00:00,2009,4
+---- TYPES
+int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp,int,int
+====
+---- QUERY
+# large limit, nondeterministic
+select count(*) from (select * from functional_kudu.alltypes where id % 2 = 0 and id > 1 limit 1000) v;
+---- RESULTS
+1000
+---- TYPES
+bigint
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/87be63e3/tests/query_test/test_kudu.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_kudu.py b/tests/query_test/test_kudu.py
index b0ced46..ae76f75 100644
--- a/tests/query_test/test_kudu.py
+++ b/tests/query_test/test_kudu.py
@@ -107,6 +107,9 @@ class TestKuduOperations(KuduTestSuite):
   def test_kudu_describe(self, vector, unique_database):
     self.run_test_case('QueryTest/kudu_describe', vector, use_db=unique_database)
 
+  def test_kudu_limit(self, vector, unique_database):
+    self.run_test_case('QueryTest/kudu_limit', vector, use_db=unique_database)
+
   def test_kudu_column_options(self, cursor, kudu_client, unique_database):
     """Test Kudu column options"""
     encodings = ["ENCODING PLAIN_ENCODING", ""]