You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/11/29 06:41:56 UTC
[7/7] impala git commit: IMPALA-6187: Fix missing conjuncts evaluation with empty projection

IMPALA-6187: Fix missing conjuncts evaluation with empty projection

Previously, scanners will assume that there are no conjuncts associated
with a scan node for queries with no materialized slots (e.g. count(*)).
This is not necessarily the case as one can write queries such as
select count(*) from tpch.lineitem where rand() * 10 < 0; or
select count(*) from tpch.lineitem where rand() > <a partition column>.
In which case, the conjuncts should still be evaluated once per row.

This change fixes the problem in the short-circuit handling logic for
count(*) to evaluate the conjuncts once per row and only commits a row
to the output row batch if the conjuncts evaluate to true.

Testing done: Added the example above to the scanner test

Change-Id: Ib530f1fdcd2c6de699977db163b3f6eb38481517
Reviewed-on: http://gerrit.cloudera.org:8080/8623
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Reviewed-by: Alex Behm <al...@cloudera.com>
Reviewed-by: Dan Hecht <dh...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/63f17e9c
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/63f17e9c
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/63f17e9c

Branch: refs/heads/master
Commit: 63f17e9ceaed92a28ea12567a36b746e54fffdb3
Parents: 2fba80e
Author: Michael Ho <kw...@cloudera.com>
Authored: Mon Nov 20 19:35:06 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Nov 29 05:53:15 2017 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             |  2 +-
 be/src/exec/hdfs-scanner.cc                     | 25 +++++++++++++++-----
 be/src/exec/hdfs-scanner.h                      |  8 +++++--
 be/src/exec/hdfs-sequence-scanner.cc            |  2 +-
 be/src/exec/kudu-scanner.cc                     | 13 +++++++++-
 be/src/exec/kudu-scanner.h                      |  2 ++
 .../queries/QueryTest/kudu-scan-node.test       |  9 +++++++
 .../queries/QueryTest/scanners.test             | 24 +++++++++++++++++++
 8 files changed, 74 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index f407877..4e4abef 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -461,7 +461,7 @@ Status HdfsParquetScanner::GetNextInternal(RowBatch* row_batch) {
     Status status = CommitRows(row_batch, num_to_commit);
     assemble_rows_timer_.Stop();
     RETURN_IF_ERROR(status);
-    row_group_rows_read_ += num_to_commit;
+    row_group_rows_read_ += max_tuples;
     COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_);
     return Status::OK();
   }

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 0dbfc5f..f53b7d0 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -207,12 +207,25 @@ Status HdfsScanner::CommitRows(int num_rows, RowBatch* row_batch) {
 int HdfsScanner::WriteTemplateTuples(TupleRow* row, int num_tuples) {
   DCHECK_GE(num_tuples, 0);
   DCHECK_EQ(scan_node_->tuple_idx(), 0);
-  DCHECK_EQ(conjunct_evals_->size(), 0);
-  if (num_tuples == 0 || template_tuple_ == NULL) return num_tuples;
-
-  Tuple** row_tuple = reinterpret_cast<Tuple**>(row);
-  for (int i = 0; i < num_tuples; ++i) row_tuple[i] = template_tuple_;
-  return num_tuples;
+  DCHECK_EQ(scan_node_->materialized_slots().size(), 0);
+  int num_to_commit = 0;
+  if (LIKELY(conjunct_evals_->size() == 0)) {
+    num_to_commit = num_tuples;
+  } else {
+    TupleRow template_tuple_row;
+    template_tuple_row.SetTuple(0, template_tuple_);
+    // Evaluate any conjuncts which may reference the partition columns.
+    for (int i = 0; i < num_tuples; ++i) {
+      if (EvalConjuncts(&template_tuple_row)) ++num_to_commit;
+    }
+  }
+  if (template_tuple_ != nullptr) {
+    Tuple** row_tuple = reinterpret_cast<Tuple**>(row);
+    for (int i = 0; i < num_to_commit; ++i) row_tuple[i] = template_tuple_;
+  } else {
+    DCHECK_EQ(scan_node_->tuple_desc()->byte_size(), 0);
+  }
+  return num_to_commit;
 }
 
 bool HdfsScanner::WriteCompleteTuple(MemPool* pool, FieldLocation* fields,

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index c603593..9b80d6c 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -328,8 +328,12 @@ class HdfsScanner {
     return ExecNode::EvalConjuncts(conjunct_evals_->data(), conjunct_evals_->size(), row);
   }
 
-  /// Sets 'num_tuples' template tuples in the batch that 'row' points to. Assumes the
-  /// 'tuple_row' only has a single tuple. Returns the number of tuples set.
+  /// Handles the case when there are no slots materialized (e.g. count(*)) by adding
+  /// up to 'num_tuples' rows to the row batch which 'row' points to. Assumes each tuple
+  /// row only has one tuple. Set the added tuples in the row batch with the template
+  /// tuple if it's not NULL. In the rare case when there are conjuncts, evaluate them
+  /// once for each row and only add a row when they evaluate to true. Returns the number
+  /// of tuple rows added.
   int WriteTemplateTuples(TupleRow* row, int num_tuples);
 
   /// Processes batches of fields and writes them out to tuple_row_mem.

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-sequence-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-sequence-scanner.cc b/be/src/exec/hdfs-sequence-scanner.cc
index 67f598c..346a18a 100644
--- a/be/src/exec/hdfs-sequence-scanner.cc
+++ b/be/src/exec/hdfs-sequence-scanner.cc
@@ -341,7 +341,7 @@ Status HdfsSequenceScanner::ProcessRange(RowBatch* row_batch) {
         }
       }
     } else {
-      add_row = WriteTemplateTuples(tuple_row_mem, 1);
+      add_row = WriteTemplateTuples(tuple_row_mem, 1) > 0;
     }
     num_rows_read++;
     if (add_row) RETURN_IF_ERROR(CommitRows(1, row_batch));

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/kudu-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.cc b/be/src/exec/kudu-scanner.cc
index 7db8878..a9b56fe 100644
--- a/be/src/exec/kudu-scanner.cc
+++ b/be/src/exec/kudu-scanner.cc
@@ -244,8 +244,19 @@ Status KuduScanner::HandleEmptyProjection(RowBatch* row_batch) {
   int num_rows_remaining = cur_kudu_batch_.NumRows() - cur_kudu_batch_num_read_;
   int rows_to_add = std::min(row_batch->capacity() - row_batch->num_rows(),
       num_rows_remaining);
+  int num_to_commit = 0;
+  if (LIKELY(conjunct_evals_.empty())) {
+    num_to_commit = rows_to_add;
+  } else {
+    for (int i = 0; i < rows_to_add; ++i) {
+      if (ExecNode::EvalConjuncts(conjunct_evals_.data(),
+              conjunct_evals_.size(), nullptr)) {
+        ++num_to_commit;
+      }
+    }
+  }
   cur_kudu_batch_num_read_ += rows_to_add;
-  row_batch->CommitRows(rows_to_add);
+  row_batch->CommitRows(num_to_commit);
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/kudu-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.h b/be/src/exec/kudu-scanner.h
index e6d4ca9..5617847 100644
--- a/be/src/exec/kudu-scanner.h
+++ b/be/src/exec/kudu-scanner.h
@@ -64,6 +64,8 @@ class KuduScanner {
  private:
   /// Handles the case where the projection is empty (e.g. count(*)).
   /// Does this by adding sets of rows to 'row_batch' instead of adding one-by-one.
+  /// If in the rare case where there is any conjunct, evaluate them once for each row
+  /// and add a row to the row batch only when the conjuncts evaluate to true.
   Status HandleEmptyProjection(RowBatch* row_batch);
 
   /// Decodes rows previously fetched from kudu, now in 'cur_rows_' into a RowBatch.

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test b/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
index 115affa..f32c8a1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test
@@ -140,3 +140,12 @@ order by id;
 ---- TYPES
 INT, TIMESTAMP
 ====
+---- QUERY
+# Regression test for IMPALA-6187. Make sure count(*) queries with partition columns only
+# won't miss conjuncts evaluation. 'id' is the partition column here.
+select count(*) from functional_kudu.alltypes where rand() + id < 0.0;
+---- RESULTS
+0
+---- TYPES
+BIGINT
+====
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/testdata/workloads/functional-query/queries/QueryTest/scanners.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/scanners.test b/testdata/workloads/functional-query/queries/QueryTest/scanners.test
index 658e4cf..99ec5c5 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/scanners.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/scanners.test
@@ -74,3 +74,27 @@ from nulltable where b = ''
 ---- TYPES
 STRING, STRING
 ====
+---- QUERY
+# The following 3 tests are regression tests for IMPALA-6187. Make sure the conjuncts are
+# evaluated when there are no materialized slots or only partition columns are accessed.
+select count(*) from alltypes where rand() * 10 >= 0.0;
+---- RESULTS
+7300
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(*) from alltypes where rand() * 10 < 0.0;
+---- RESULTS
+0
+---- TYPES
+BIGINT
+====
+---- QUERY
+# 'year' and 'month' are partition columns.
+select count(*) from alltypes where rand() - year > month;
+---- RESULTS
+0
+---- TYPES
+BIGINT
+====