You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/11/29 06:41:56 UTC
[7/7] impala git commit: IMPALA-6187: Fix missing conjuncts
evaluation with empty projection
IMPALA-6187: Fix missing conjuncts evaluation with empty projection
Previously, scanners will assume that there are no conjuncts associated
with a scan node for queries with no materialized slots (e.g. count(*)).
This is not necessarily the case as one can write queries such as
select count(*) from tpch.lineitem where rand() * 10 < 0; or
select count(*) from tpch.lineitem where rand() > <a partition column>.
In which case, the conjuncts should still be evaluated once per row.
This change fixes the problem in the short-circuit handling logic for
count(*) to evaluate the conjuncts once per row and only commits a row
to the output row batch if the conjuncts evaluate to true.
Testing done: Added the example above to the scanner test
Change-Id: Ib530f1fdcd2c6de699977db163b3f6eb38481517
Reviewed-on: http://gerrit.cloudera.org:8080/8623
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Reviewed-by: Alex Behm <al...@cloudera.com>
Reviewed-by: Dan Hecht <dh...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/63f17e9c
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/63f17e9c
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/63f17e9c
Branch: refs/heads/master
Commit: 63f17e9ceaed92a28ea12567a36b746e54fffdb3
Parents: 2fba80e
Author: Michael Ho <kw...@cloudera.com>
Authored: Mon Nov 20 19:35:06 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Nov 29 05:53:15 2017 +0000
----------------------------------------------------------------------
be/src/exec/hdfs-parquet-scanner.cc | 2 +-
be/src/exec/hdfs-scanner.cc | 25 +++++++++++++++-----
be/src/exec/hdfs-scanner.h | 8 +++++--
be/src/exec/hdfs-sequence-scanner.cc | 2 +-
be/src/exec/kudu-scanner.cc | 13 +++++++++-
be/src/exec/kudu-scanner.h | 2 ++
.../queries/QueryTest/kudu-scan-node.test | 9 +++++++
.../queries/QueryTest/scanners.test | 24 +++++++++++++++++++
8 files changed, 74 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index f407877..4e4abef 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -461,7 +461,7 @@ Status HdfsParquetScanner::GetNextInternal(RowBatch* row_batch) {
Status status = CommitRows(row_batch, num_to_commit);
assemble_rows_timer_.Stop();
RETURN_IF_ERROR(status);
- row_group_rows_read_ += num_to_commit;
+ row_group_rows_read_ += max_tuples;
COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_);
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 0dbfc5f..f53b7d0 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -207,12 +207,25 @@ Status HdfsScanner::CommitRows(int num_rows, RowBatch* row_batch) {
int HdfsScanner::WriteTemplateTuples(TupleRow* row, int num_tuples) {
DCHECK_GE(num_tuples, 0);
DCHECK_EQ(scan_node_->tuple_idx(), 0);
- DCHECK_EQ(conjunct_evals_->size(), 0);
- if (num_tuples == 0 || template_tuple_ == NULL) return num_tuples;
-
- Tuple** row_tuple = reinterpret_cast<Tuple**>(row);
- for (int i = 0; i < num_tuples; ++i) row_tuple[i] = template_tuple_;
- return num_tuples;
+ DCHECK_EQ(scan_node_->materialized_slots().size(), 0);
+ int num_to_commit = 0;
+ if (LIKELY(conjunct_evals_->size() == 0)) {
+ num_to_commit = num_tuples;
+ } else {
+ TupleRow template_tuple_row;
+ template_tuple_row.SetTuple(0, template_tuple_);
+ // Evaluate any conjuncts which may reference the partition columns.
+ for (int i = 0; i < num_tuples; ++i) {
+ if (EvalConjuncts(&template_tuple_row)) ++num_to_commit;
+ }
+ }
+ if (template_tuple_ != nullptr) {
+ Tuple** row_tuple = reinterpret_cast<Tuple**>(row);
+ for (int i = 0; i < num_to_commit; ++i) row_tuple[i] = template_tuple_;
+ } else {
+ DCHECK_EQ(scan_node_->tuple_desc()->byte_size(), 0);
+ }
+ return num_to_commit;
}
bool HdfsScanner::WriteCompleteTuple(MemPool* pool, FieldLocation* fields,
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index c603593..9b80d6c 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -328,8 +328,12 @@ class HdfsScanner {
return ExecNode::EvalConjuncts(conjunct_evals_->data(), conjunct_evals_->size(), row);
}
- /// Sets 'num_tuples' template tuples in the batch that 'row' points to. Assumes the
- /// 'tuple_row' only has a single tuple. Returns the number of tuples set.
+ /// Handles the case when there are no slots materialized (e.g. count(*)) by adding
+ /// up to 'num_tuples' rows to the row batch which 'row' points to. Assumes each tuple
+ /// row only has one tuple. Set the added tuples in the row batch with the template
+ /// tuple if it's not NULL. In the rare case when there are conjuncts, evaluate them
+ /// once for each row and only add a row when they evaluate to true. Returns the number
+ /// of tuple rows added.
int WriteTemplateTuples(TupleRow* row, int num_tuples);
/// Processes batches of fields and writes them out to tuple_row_mem.
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-sequence-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-sequence-scanner.cc b/be/src/exec/hdfs-sequence-scanner.cc
index 67f598c..346a18a 100644
--- a/be/src/exec/hdfs-sequence-scanner.cc
+++ b/be/src/exec/hdfs-sequence-scanner.cc
@@ -341,7 +341,7 @@ Status HdfsSequenceScanner::ProcessRange(RowBatch* row_batch) {
}
}
} else {
- add_row = WriteTemplateTuples(tuple_row_mem, 1);
+ add_row = WriteTemplateTuples(tuple_row_mem, 1) > 0;
}
num_rows_read++;
if (add_row) RETURN_IF_ERROR(CommitRows(1, row_batch));
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/kudu-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.cc b/be/src/exec/kudu-scanner.cc
index 7db8878..a9b56fe 100644
--- a/be/src/exec/kudu-scanner.cc
+++ b/be/src/exec/kudu-scanner.cc
@@ -244,8 +244,19 @@ Status KuduScanner::HandleEmptyProjection(RowBatch* row_batch) {
int num_rows_remaining = cur_kudu_batch_.NumRows() - cur_kudu_batch_num_read_;
int rows_to_add = std::min(row_batch->capacity() - row_batch->num_rows(),
num_rows_remaining);
+ int num_to_commit = 0;
+ if (LIKELY(conjunct_evals_.empty())) {
+ num_to_commit = rows_to_add;
+ } else {
+ for (int i = 0; i < rows_to_add; ++i) {
+ if (ExecNode::EvalConjuncts(conjunct_evals_.data(),
+ conjunct_evals_.size(), nullptr)) {
+ ++num_to_commit;
+ }
+ }
+ }
cur_kudu_batch_num_read_ += rows_to_add;
- row_batch->CommitRows(rows_to_add);
+ row_batch->CommitRows(num_to_commit);
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/kudu-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.h b/be/src/exec/kudu-scanner.h
index e6d4ca9..5617847 100644
--- a/be/src/exec/kudu-scanner.h
+++ b/be/src/exec/kudu-scanner.h
@@ -64,6 +64,8 @@ class KuduScanner {
private:
/// Handles the case where the projection is empty (e.g. count(*)).
/// Does this by adding sets of rows to 'row_batch' instead of adding one-by-one.
+ /// If in the rare case where there is any conjunct, evaluate them once for each row
+ /// and add a row to the row batch only when the conjuncts evaluate to true.
Status HandleEmptyProjection(RowBatch* row_batch);
/// Decodes rows previously fetched from kudu, now in 'cur_rows_' into a RowBatch.
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test b/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
index 115affa..f32c8a1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
@@ -140,3 +140,12 @@ order by id;
---- TYPES
INT, TIMESTAMP
====
+---- QUERY
+# Regression test for IMPALA-6187. Make sure count(*) queries with partition columns only
+# won't miss conjuncts evaluation. 'id' is the partition column here.
+select count(*) from functional_kudu.alltypes where rand() + id < 0.0;
+---- RESULTS
+0
+---- TYPES
+BIGINT
+====
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/testdata/workloads/functional-query/queries/QueryTest/scanners.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/scanners.test b/testdata/workloads/functional-query/queries/QueryTest/scanners.test
index 658e4cf..99ec5c5 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/scanners.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/scanners.test
@@ -74,3 +74,27 @@ from nulltable where b = ''
---- TYPES
STRING, STRING
====
+---- QUERY
+# The following 3 tests are regression tests for IMPALA-6187. Make sure the conjuncts are
+# evaluated when there are no materialized slots or only partition columns are accessed.
+select count(*) from alltypes where rand() * 10 >= 0.0;
+---- RESULTS
+7300
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(*) from alltypes where rand() * 10 < 0.0;
+---- RESULTS
+0
+---- TYPES
+BIGINT
+====
+---- QUERY
+# 'year' and 'month' are partition columns.
+select count(*) from alltypes where rand() - year > month;
+---- RESULTS
+0
+---- TYPES
+BIGINT
+====