You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/11/23 04:19:26 UTC
[2/5] incubator-impala git commit: IMPALA-4985: use parquet stats of
nested types for dynamic pruning
IMPALA-4985: use parquet stats of nested types for dynamic pruning
Currently, parquet row-groups can be pruned at run-time using
min/max stats when predicates (IN, binary) are specified on
columns of scalar type. This patch extends pruning to nested types
for the same class of predicates. A nested value is an instance
of a nested type (struct, array, map). A nested value consists of
other nested and scalar values (as declared by its type).
Predicates that can be used for row-group pruning must be applied to
nested scalar values. In addition, the parent of the nested scalar
must be required, that is, known to be non-empty. The latter
requirement is conservative: some filters that could safely be used
for pruning are skipped in order to guarantee correctness.
Testing:
- extended nested-types-parquet-stats e2e test cases.
Change-Id: I0c99e20cb080b504442cd5376ea3e046016158fe
Reviewed-on: http://gerrit.cloudera.org:8080/8480
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/21a96ed2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/21a96ed2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/21a96ed2
Branch: refs/heads/master
Commit: 21a96ed2e39537db39cda14ebdacc83a8c4c89f3
Parents: 223707d
Author: Vuk Ercegovac <ve...@cloudera.com>
Authored: Wed Nov 1 10:35:43 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Nov 22 22:00:16 2017 +0000
----------------------------------------------------------------------
be/src/exec/hdfs-parquet-scanner.h | 2 +-
.../impala/analysis/CollectionStructType.java | 1 +
.../org/apache/impala/analysis/SelectStmt.java | 8 +
.../org/apache/impala/analysis/SlotRef.java | 20 ++
.../org/apache/impala/planner/HdfsScanNode.java | 82 ++++--
.../queries/PlannerTest/constant-folding.test | 2 +-
.../queries/PlannerTest/mt-dop-validation.test | 6 +-
.../queries/PlannerTest/parquet-filtering.test | 275 ++++++++++++++++++
.../QueryTest/nested-types-parquet-stats.test | 288 ++++++++++++++++++-
tests/query_test/test_nested_types.py | 3 -
10 files changed, 649 insertions(+), 38 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/be/src/exec/hdfs-parquet-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.h b/be/src/exec/hdfs-parquet-scanner.h
index 0eea458..99b5a60 100644
--- a/be/src/exec/hdfs-parquet-scanner.h
+++ b/be/src/exec/hdfs-parquet-scanner.h
@@ -188,7 +188,7 @@ class BoolColumnReader;
/// 0), but we don't distinguish between these two cases yet.
/// TODO: fix this (IMPALA-2272)
///
-/// The column readers that materialize this structure form a tree analagous to the
+/// The column readers that materialize this structure form a tree analogous to the
/// materialized output:
/// CollectionColumnReader slot_id=0 node="repeated group list (d=2 r=1)"
/// CollectionColumnReader slot_id=1 node="repeated group list (d=4 r=2)"
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/fe/src/main/java/org/apache/impala/analysis/CollectionStructType.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/CollectionStructType.java b/fe/src/main/java/org/apache/impala/analysis/CollectionStructType.java
index a982774..d05a6b2 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CollectionStructType.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CollectionStructType.java
@@ -75,4 +75,5 @@ public class CollectionStructType extends StructType {
public StructField getOptionalField() { return optionalField_; }
public boolean isMapStruct() { return isMapStruct_; }
+ public boolean isArrayStruct() { return !isMapStruct_; }
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java b/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
index 068e09a..2ba5105 100644
--- a/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
@@ -291,6 +291,14 @@ public class SelectStmt extends QueryStmt {
* - collection table ref represents the rhs of an inner/cross/semi join
* - collection table ref's parent tuple is not outer joined
*
+ * Example: table T has field A which is of type array<array<int>>.
+ * 1) ... T join T.A a join a.item a_nest ... : all nodes on the path T -> a -> a_nest
+ * are required so are checked for !empty.
+ * 2) ... T left outer join T.A a join a.item a_nest ... : no !empty.
+ * 3) ... T join T.A a left outer join a.item a_nest ... : a checked for !empty.
+ * 4) ... T left outer join T.A a left outer join a.item a_nest ... : no !empty.
+ *
+ *
* TODO: In some cases, it is possible to generate !empty() predicates for a correlated
* table ref, but in general, that is not correct for non-trivial query blocks.
* For example, if the block with the correlated ref has an aggregation then adding a
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/SlotRef.java b/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
index f4b2144..b4505ba 100644
--- a/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
@@ -150,6 +150,26 @@ public class SlotRef extends Expr {
return "<slot " + Integer.toString(desc_.getId().asInt()) + ">";
}
+ /**
+ * Checks if this slotRef refers to an array "pos" pseudo-column.
+ *
+ * Note: checking whether the column is null distinguishes between top-level columns
+ * and nested types. This check more specifically looks just for a reference to the
+ * "pos" field of an array type.
+ */
+ public boolean isArrayPosRef() {
+ TupleDescriptor parent = getDesc().getParent();
+ if (parent == null) return false;
+ Type parentType = parent.getType();
+ if (parentType instanceof CollectionStructType) {
+ if (((CollectionStructType)parentType).isArrayStruct() &&
+ getDesc().getLabel().equals(Path.ARRAY_POS_FIELD_NAME)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
@Override
protected void toThrift(TExprNode msg) {
msg.node_type = TExprNodeType.SLOT_REF;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 44f58eb..1e0dd72 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -37,6 +37,7 @@ import org.apache.impala.analysis.FunctionCallExpr;
import org.apache.impala.analysis.FunctionName;
import org.apache.impala.analysis.FunctionParams;
import org.apache.impala.analysis.InPredicate;
+import org.apache.impala.analysis.IsNotEmptyPredicate;
import org.apache.impala.analysis.LiteralExpr;
import org.apache.impala.analysis.NullLiteral;
import org.apache.impala.analysis.SlotDescriptor;
@@ -173,6 +174,10 @@ public class HdfsScanNode extends ScanNode {
private final Map<TupleDescriptor, List<Expr>> collectionConjuncts_ =
Maps.newLinkedHashMap();
+ // TupleDescriptors of collection slots that have an IsNotEmptyPredicate. See
+ // SelectStmt#registerIsNotEmptyPredicates.
+ private final Set<TupleDescriptor> notEmptyCollections_ = Sets.newHashSet();
+
// Map from SlotIds to indices in PlanNodes.conjuncts_ that are eligible for
// dictionary filtering
private Map<Integer, List<Integer>> dictionaryFilterConjuncts_ =
@@ -398,6 +403,7 @@ public class HdfsScanNode extends ScanNode {
*/
private void assignCollectionConjuncts(Analyzer analyzer) {
collectionConjuncts_.clear();
+ addNotEmptyCollections(conjuncts_);
assignCollectionConjuncts(desc_, analyzer);
}
@@ -426,13 +432,13 @@ public class HdfsScanNode extends ScanNode {
// We only support slot refs on the left hand side of the predicate, a rewriting
// rule makes sure that all compatible exprs are rewritten into this form. Only
// implicit casts are supported.
- SlotRef slot = binaryPred.getChild(0).unwrapSlotRef(true);
- if (slot == null) return;
+ SlotRef slotRef = binaryPred.getChild(0).unwrapSlotRef(true);
+ if (slotRef == null) return;
// This node is a table scan, so this must be a scanning slot.
- Preconditions.checkState(slot.getDesc().isScanSlot());
- // If the column is null, then this can be a 'pos' scanning slot of a nested type.
- if (slot.getDesc().getColumn() == null) return;
+ Preconditions.checkState(slotRef.getDesc().isScanSlot());
+ // Skip the slot ref if it refers to an array's "pos" field.
+ if (slotRef.isArrayPosRef()) return;
Expr constExpr = binaryPred.getChild(1);
// Only constant exprs can be evaluated against parquet::Statistics. This includes
@@ -444,24 +450,23 @@ public class HdfsScanNode extends ScanNode {
if (op == BinaryPredicate.Operator.LT || op == BinaryPredicate.Operator.LE ||
op == BinaryPredicate.Operator.GE || op == BinaryPredicate.Operator.GT) {
minMaxOriginalConjuncts_.add(binaryPred);
- buildStatsPredicate(analyzer, slot, binaryPred, op);
+ buildStatsPredicate(analyzer, slotRef, binaryPred, op);
} else if (op == BinaryPredicate.Operator.EQ) {
minMaxOriginalConjuncts_.add(binaryPred);
// TODO: this could be optimized for boolean columns.
- buildStatsPredicate(analyzer, slot, binaryPred, BinaryPredicate.Operator.LE);
- buildStatsPredicate(analyzer, slot, binaryPred, BinaryPredicate.Operator.GE);
+ buildStatsPredicate(analyzer, slotRef, binaryPred, BinaryPredicate.Operator.LE);
+ buildStatsPredicate(analyzer, slotRef, binaryPred, BinaryPredicate.Operator.GE);
}
}
private void tryComputeInListMinMaxPredicate(Analyzer analyzer, InPredicate inPred) {
- // Retrieve the left side of the IN predicate. It must be a simple slot to
- // proceed.
- SlotRef slot = inPred.getBoundSlot();
- if (slot == null) return;
+ // Retrieve the left side of the IN predicate. It must be a simple slot to proceed.
+ SlotRef slotRef = inPred.getBoundSlot();
+ if (slotRef == null) return;
// This node is a table scan, so this must be a scanning slot.
- Preconditions.checkState(slot.getDesc().isScanSlot());
- // If the column is null, then this can be a 'pos' scanning slot of a nested type.
- if (slot.getDesc().getColumn() == null) return;
+ Preconditions.checkState(slotRef.getDesc().isScanSlot());
+ // Skip the slot ref if it refers to an array's "pos" field.
+ if (slotRef.isArrayPosRef()) return;
if (inPred.isNotIn()) return;
ArrayList<Expr> children = inPred.getChildren();
@@ -488,8 +493,30 @@ public class HdfsScanNode extends ScanNode {
children.get(0).clone(), max.clone());
minMaxOriginalConjuncts_.add(inPred);
- buildStatsPredicate(analyzer, slot, minBound, minBound.getOp());
- buildStatsPredicate(analyzer, slot, maxBound, maxBound.getOp());
+ buildStatsPredicate(analyzer, slotRef, minBound, minBound.getOp());
+ buildStatsPredicate(analyzer, slotRef, maxBound, maxBound.getOp());
+ }
+
+ private void tryComputeMinMaxPredicate(Analyzer analyzer, Expr pred) {
+ if (pred instanceof BinaryPredicate) {
+ tryComputeBinaryMinMaxPredicate(analyzer, (BinaryPredicate) pred);
+ } else if (pred instanceof InPredicate) {
+ tryComputeInListMinMaxPredicate(analyzer, (InPredicate) pred);
+ }
+ }
+
+ /**
+ * Populates notEmptyCollections_ based on IsNotEmptyPredicates in the given conjuncts.
+ */
+ private void addNotEmptyCollections(List<Expr> conjuncts) {
+ for (Expr expr : conjuncts) {
+ if (expr instanceof IsNotEmptyPredicate) {
+ SlotRef ref = (SlotRef)((IsNotEmptyPredicate)expr).getChild(0);
+ Preconditions.checkState(ref.getDesc().getType().isComplexType());
+ Preconditions.checkState(ref.getDesc().getItemTupleDesc() != null);
+ notEmptyCollections_.add(ref.getDesc().getItemTupleDesc());
+ }
+ }
}
/**
@@ -505,11 +532,17 @@ public class HdfsScanNode extends ScanNode {
minMaxTuple_ = descTbl.createTupleDescriptor(tupleName);
minMaxTuple_.setPath(desc_.getPath());
- for (Expr pred: conjuncts_) {
- if (pred instanceof BinaryPredicate) {
- tryComputeBinaryMinMaxPredicate(analyzer, (BinaryPredicate) pred);
- } else if (pred instanceof InPredicate) {
- tryComputeInListMinMaxPredicate(analyzer, (InPredicate) pred);
+ // Adds predicates for scalar, top-level columns.
+ for (Expr pred: conjuncts_) tryComputeMinMaxPredicate(analyzer, pred);
+
+ // Adds predicates for collections.
+ for (Map.Entry<TupleDescriptor, List<Expr>> entry: collectionConjuncts_.entrySet()) {
+ // Adds only predicates for collections that are filtered by an IsNotEmptyPredicate.
+ // It is assumed that analysis adds these filters such that they are correct, but
+ // potentially conservative. See the tests for examples that could benefit from
+ // being more aggressive (yet still correct).
+ if (notEmptyCollections_.contains(entry.getKey())) {
+ for (Expr pred: entry.getValue()) tryComputeMinMaxPredicate(analyzer, pred);
}
}
minMaxTuple_.computeMemLayout();
@@ -517,7 +550,7 @@ public class HdfsScanNode extends ScanNode {
/**
* Recursively collects and assigns conjuncts bound by tuples materialized in a
- * collection-typed slot.
+ * collection-typed slot. As conjuncts are seen, collect non-empty nested collections.
*
* Limitation: Conjuncts that must first be migrated into inline views and that cannot
* be captured by slot binding will not be assigned here, but in an UnnestNode.
@@ -525,7 +558,7 @@ public class HdfsScanNode extends ScanNode {
* non-SlotRef exprs in the inline-view's select list. We only capture value transfers
* between slots, and not between arbitrary exprs.
*
- * TODO for 2.3: The logic for gathering conjuncts and deciding which ones should be
+ * TODO: The logic for gathering conjuncts and deciding which ones should be
* marked as assigned needs to be clarified and consolidated in one place. The code
* below is rather different from the code for assigning the top-level conjuncts in
* init() although the performed tasks is conceptually identical. Refactoring the
@@ -560,6 +593,7 @@ public class HdfsScanNode extends ScanNode {
if (!collectionConjuncts.isEmpty()) {
analyzer.materializeSlots(collectionConjuncts);
collectionConjuncts_.put(itemTupleDesc, collectionConjuncts);
+ addNotEmptyCollections(collectionConjuncts);
}
// Recursively look for collection-typed slots in nested tuple descriptors.
assignCollectionConjuncts(itemTupleDesc, analyzer);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
index 1e156ba..1a8b6a7 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
@@ -53,7 +53,7 @@ PLAN-ROOT SINK
stats-rows=150000 extrapolated-rows=disabled
table stats: rows=150000 size=292.35MB
columns missing stats: c_orders
- parquet statistics predicates: c_custkey > 10
+ parquet statistics predicates: c_custkey > 10, o_orderkey = 4
parquet dictionary predicates: c_custkey > 10
mem-estimate=176.00MB mem-reservation=0B
tuple-ids=0 row-size=24B cardinality=15000
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
index dcebf07..506921a 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
@@ -242,7 +242,7 @@ PLAN-ROOT SINK
stats-rows=150000 extrapolated-rows=disabled
table stats: rows=150000 size=292.35MB
columns missing stats: c_orders
- parquet statistics predicates: c_custkey < 10
+ parquet statistics predicates: c_custkey < 10, o_orderkey < 5, l_linenumber < 3
parquet dictionary predicates: c_custkey < 10
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=254B cardinality=15000
@@ -302,7 +302,7 @@ Per-Host Resources: mem-estimate=264.00MB mem-reservation=0B
stats-rows=150000 extrapolated-rows=disabled
table stats: rows=150000 size=292.35MB
columns missing stats: c_orders
- parquet statistics predicates: c_custkey < 10
+ parquet statistics predicates: c_custkey < 10, o_orderkey < 5, l_linenumber < 3
parquet dictionary predicates: c_custkey < 10
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=254B cardinality=15000
@@ -353,6 +353,7 @@ PLAN-ROOT SINK
stats-rows=150000 extrapolated-rows=disabled
table stats: rows=150000 size=292.35MB
columns missing stats: c_orders, c_orders
+ parquet statistics predicates: o1.o_orderkey < 5
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=270B cardinality=150000
---- PARALLELPLANS
@@ -403,6 +404,7 @@ Per-Host Resources: mem-estimate=269.81MB mem-reservation=5.81MB
stats-rows=150000 extrapolated-rows=disabled
table stats: rows=150000 size=292.35MB
columns missing stats: c_orders, c_orders
+ parquet statistics predicates: o1.o_orderkey < 5
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=270B cardinality=150000
====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
index 4165e70..bdb9102 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
@@ -88,3 +88,278 @@ PLAN-ROOT SINK
mem-estimate=48.00MB mem-reservation=0B
tuple-ids=0 row-size=24B cardinality=unavailable
====
+# Test collection types where all collections on the path are required (inner
+# join descent). Expect the scan node to include !empty checks for both collections and
+# the min-max filtering for the leaf predicate.
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d cn, cn.item a
+where a.item.e < -10;
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+| Per-Host Resources: mem-estimate=32.00MB mem-reservation=0B
+PLAN-ROOT SINK
+| mem-estimate=0B mem-reservation=0B
+|
+01:SUBPLAN
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=2,1,0 row-size=44B cardinality=unavailable
+|
+|--08:NESTED LOOP JOIN [CROSS JOIN]
+| | mem-estimate=24B mem-reservation=0B
+| | tuple-ids=2,1,0 row-size=44B cardinality=100
+| |
+| |--02:SINGULAR ROW SRC
+| | parent-subplan=01
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=0 row-size=24B cardinality=1
+| |
+| 04:SUBPLAN
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2,1 row-size=20B cardinality=100
+| |
+| |--07:NESTED LOOP JOIN [CROSS JOIN]
+| | | mem-estimate=16B mem-reservation=0B
+| | | tuple-ids=2,1 row-size=20B cardinality=10
+| | |
+| | |--05:SINGULAR ROW SRC
+| | | parent-subplan=04
+| | | mem-estimate=0B mem-reservation=0B
+| | | tuple-ids=1 row-size=16B cardinality=1
+| | |
+| | 06:UNNEST [cn.item a]
+| | parent-subplan=04
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2 row-size=0B cardinality=10
+| |
+| 03:UNNEST [c.nested_struct.c.d cn]
+| parent-subplan=01
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=1 row-size=0B cardinality=10
+|
+00:SCAN HDFS [functional_parquet.complextypestbl c]
+ partitions=1/1 files=2 size=6.92KB
+ predicates: !empty(c.nested_struct.c.d)
+ predicates on cn: !empty(cn.item)
+ predicates on a: a.item.e < -10
+ stats-rows=unavailable extrapolated-rows=disabled
+ table stats: rows=unavailable size=6.92KB
+ columns missing stats: id
+ parquet statistics predicates: a.item.e < -10
+ mem-estimate=32.00MB mem-reservation=0B
+ tuple-ids=0 row-size=24B cardinality=unavailable
+====
+# Test collection types where the lower collection in the path is optional
+# (outer join descent) and the upper is required (inner join descent).
+# Expect the scan node to include !empty test for the root, but no min-max
+# filter for the leaf (since it does not have a !empty check).
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d cn
+left outer join cn.item a
+where a.item.e < -10;
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+| Per-Host Resources: mem-estimate=32.00MB mem-reservation=0B
+PLAN-ROOT SINK
+| mem-estimate=0B mem-reservation=0B
+|
+01:SUBPLAN
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=2N,1,0 row-size=44B cardinality=unavailable
+|
+|--08:SUBPLAN
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2N,1,0 row-size=44B cardinality=10
+| |
+| |--06:NESTED LOOP JOIN [RIGHT OUTER JOIN]
+| | | predicates: a.item.e < -10
+| | | mem-estimate=40B mem-reservation=0B
+| | | tuple-ids=2N,1,0 row-size=44B cardinality=1
+| | |
+| | |--04:SINGULAR ROW SRC
+| | | parent-subplan=08
+| | | mem-estimate=0B mem-reservation=0B
+| | | tuple-ids=1,0 row-size=40B cardinality=1
+| | |
+| | 05:UNNEST [cn.item a]
+| | parent-subplan=08
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2 row-size=0B cardinality=10
+| |
+| 07:NESTED LOOP JOIN [CROSS JOIN]
+| | mem-estimate=24B mem-reservation=0B
+| | tuple-ids=1,0 row-size=40B cardinality=10
+| |
+| |--02:SINGULAR ROW SRC
+| | parent-subplan=01
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=0 row-size=24B cardinality=1
+| |
+| 03:UNNEST [c.nested_struct.c.d cn]
+| parent-subplan=01
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=1 row-size=0B cardinality=10
+|
+00:SCAN HDFS [functional_parquet.complextypestbl c]
+ partitions=1/1 files=2 size=6.92KB
+ predicates: !empty(c.nested_struct.c.d)
+ predicates on a: a.item.e < -10
+ stats-rows=unavailable extrapolated-rows=disabled
+ table stats: rows=unavailable size=6.92KB
+ columns missing stats: id
+ mem-estimate=32.00MB mem-reservation=0B
+ tuple-ids=0 row-size=24B cardinality=unavailable
+====
+# Tests collection types where the outer is optional (outer join descent)
+# and the inner is required (inner join descent). In this case, !empty is
+# not pushed for either collection, so there is no min-max pruning either.
+select id from functional_parquet.complextypestbl c
+left outer join c.nested_struct.c.d cn, cn.item a where a.item.e < -10;
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+| Per-Host Resources: mem-estimate=32.00MB mem-reservation=0B
+PLAN-ROOT SINK
+| mem-estimate=0B mem-reservation=0B
+|
+01:SUBPLAN
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=2,1N,0 row-size=44B cardinality=unavailable
+|
+|--08:SUBPLAN
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2,1N,0 row-size=44B cardinality=10
+| |
+| |--06:NESTED LOOP JOIN [CROSS JOIN]
+| | | mem-estimate=40B mem-reservation=0B
+| | | tuple-ids=2,1N,0 row-size=44B cardinality=10
+| | |
+| | |--04:SINGULAR ROW SRC
+| | | parent-subplan=08
+| | | mem-estimate=0B mem-reservation=0B
+| | | tuple-ids=1N,0 row-size=40B cardinality=1
+| | |
+| | 05:UNNEST [cn.item a]
+| | parent-subplan=08
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2 row-size=0B cardinality=10
+| |
+| 07:NESTED LOOP JOIN [RIGHT OUTER JOIN]
+| | mem-estimate=24B mem-reservation=0B
+| | tuple-ids=1N,0 row-size=40B cardinality=1
+| |
+| |--02:SINGULAR ROW SRC
+| | parent-subplan=01
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=0 row-size=24B cardinality=1
+| |
+| 03:UNNEST [c.nested_struct.c.d cn]
+| parent-subplan=01
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=1 row-size=0B cardinality=10
+|
+00:SCAN HDFS [functional_parquet.complextypestbl c]
+ partitions=1/1 files=2 size=6.92KB
+ predicates on a: a.item.e < -10
+ stats-rows=unavailable extrapolated-rows=disabled
+ table stats: rows=unavailable size=6.92KB
+ columns missing stats: id
+ mem-estimate=32.00MB mem-reservation=0B
+ tuple-ids=0 row-size=24B cardinality=unavailable
+====
+# Test collections so that each level has a filter applied.
+select c_custkey from tpch_nested_parquet.customer c, c.c_orders o,
+o.o_lineitems l where c_custkey > 0 and o.o_orderkey > 0 and l.l_partkey > 0;
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+| Per-Host Resources: mem-estimate=176.00MB mem-reservation=0B
+PLAN-ROOT SINK
+| mem-estimate=0B mem-reservation=0B
+|
+01:SUBPLAN
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=2,1,0 row-size=56B cardinality=1500000
+|
+|--08:NESTED LOOP JOIN [CROSS JOIN]
+| | mem-estimate=24B mem-reservation=0B
+| | tuple-ids=2,1,0 row-size=56B cardinality=100
+| |
+| |--02:SINGULAR ROW SRC
+| | parent-subplan=01
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=0 row-size=24B cardinality=1
+| |
+| 04:SUBPLAN
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2,1 row-size=32B cardinality=100
+| |
+| |--07:NESTED LOOP JOIN [CROSS JOIN]
+| | | mem-estimate=24B mem-reservation=0B
+| | | tuple-ids=2,1 row-size=32B cardinality=10
+| | |
+| | |--05:SINGULAR ROW SRC
+| | | parent-subplan=04
+| | | mem-estimate=0B mem-reservation=0B
+| | | tuple-ids=1 row-size=24B cardinality=1
+| | |
+| | 06:UNNEST [o.o_lineitems l]
+| | parent-subplan=04
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=2 row-size=0B cardinality=10
+| |
+| 03:UNNEST [c.c_orders o]
+| parent-subplan=01
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=1 row-size=0B cardinality=10
+|
+00:SCAN HDFS [tpch_nested_parquet.customer c]
+ partitions=1/1 files=4 size=292.36MB
+ predicates: c_custkey > 0, !empty(c.c_orders)
+ predicates on o: !empty(o.o_lineitems), o.o_orderkey > 0
+ predicates on l: l.l_partkey > 0
+ stats-rows=150000 extrapolated-rows=disabled
+ table stats: rows=150000 size=292.36MB
+ columns missing stats: c_orders
+ parquet statistics predicates: c_custkey > 0, o.o_orderkey > 0, l.l_partkey > 0
+ parquet dictionary predicates: c_custkey > 0
+ mem-estimate=176.00MB mem-reservation=0B
+ tuple-ids=0 row-size=24B cardinality=15000
+====
+# Test collections in a way that would incorrectly apply a min-max
+# filter at the scan. Expect no min-max filter and no !empty tests.
+select count(*) from functional_parquet.complextypestbl c left outer join
+(select * from c.int_array where item > 10) v;
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+| Per-Host Resources: mem-estimate=26.00MB mem-reservation=0B
+PLAN-ROOT SINK
+| mem-estimate=0B mem-reservation=0B
+|
+05:AGGREGATE [FINALIZE]
+| output: count(*)
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
+| tuple-ids=3 row-size=8B cardinality=1
+|
+01:SUBPLAN
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=1N,0 row-size=20B cardinality=unavailable
+|
+|--04:NESTED LOOP JOIN [RIGHT OUTER JOIN]
+| | mem-estimate=16B mem-reservation=0B
+| | tuple-ids=1N,0 row-size=20B cardinality=1
+| |
+| |--02:SINGULAR ROW SRC
+| | parent-subplan=01
+| | mem-estimate=0B mem-reservation=0B
+| | tuple-ids=0 row-size=16B cardinality=1
+| |
+| 03:UNNEST [c.int_array]
+| parent-subplan=01
+| mem-estimate=0B mem-reservation=0B
+| tuple-ids=1 row-size=4B cardinality=10
+|
+00:SCAN HDFS [functional_parquet.complextypestbl c]
+ partitions=1/1 files=2 size=6.92KB
+ predicates on int_array: item > 10
+ stats-rows=unavailable extrapolated-rows=disabled
+ table stats: rows=unavailable size=6.92KB
+ column stats: unavailable
+ mem-estimate=16.00MB mem-reservation=0B
+ tuple-ids=0 row-size=16B cardinality=unavailable
+====
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-stats.test
index c8ba303..32a9a28 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-stats.test
@@ -5,8 +5,8 @@ select count(*) from functional_parquet.complextypestbl where id < 1
---- RESULTS
0
---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 2 .*
-row_regex: .*NumStatsFilteredRowGroups: 2 .*
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
====
---- QUERY
# Filter root-level scalar column in file with nested types.
@@ -15,16 +15,290 @@ from functional_parquet.complextypestbl, complextypestbl.int_array
where id < 0;
---- RESULTS
---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 2 .*
-row_regex: .*NumStatsFilteredRowGroups: 2 .*
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
====
---- QUERY
-# Nested columns do not support stats based filtering.
+# Row-group skipping based on min-max stats for collection types.
+# Cases tested:
+# - collection required vs. not required (use outer joins)
+# - collection type: array, map, struct
+#
+# Array collection.
select id, int_array.item
from functional_parquet.complextypestbl, functional_parquet.complextypestbl.int_array
where int_array.item < -1;
---- RESULTS
---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 2 .*
-row_regex: .*NumStatsFilteredRowGroups: 0 .*
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Map collection
+select id, int_map.key
+from functional_parquet.complextypestbl, functional_parquet.complextypestbl.int_map
+where int_map.value < -1;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Multiple nesting, all filtered
+select id from functional_parquet.complextypestbl c, c.int_array_array cn, cn.item bottom
+where bottom.item < -2;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Multiple nesting, some filtered. Expect some groups filtered, not all. < filter.
+select id from functional_parquet.complextypestbl c, c.int_array_array cn, cn.item bottom
+where bottom.item < -1;
+---- RESULTS
+8
+---- TYPES
+bigint
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Multiple nesting, some filtered. Expect some groups filtered, not all. = filter.
+select id from functional_parquet.complextypestbl c, c.int_array_array cn, cn.item bottom
+where bottom.item = -2;
+---- RESULTS
+8
+---- TYPES
+bigint
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Multiple nesting, some filtered. Expect some groups filtered, not all. IN filter.
+select id from functional_parquet.complextypestbl c, c.int_array_array cn, cn.item bottom
+where bottom.item in (5,6);
+---- RESULTS
+7
+7
+---- TYPES
+bigint
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Multiple nesting, some filtered. Expect some groups filtered, not all. complex filter.
+select id from functional_parquet.complextypestbl c, c.int_array_array cn, cn.item bottom
+where bottom.item > -2 and bottom.item in (-2, -1);
+---- RESULTS
+8
+---- TYPES
+bigint
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Multiple nesting, some filtered. Expect some groups filtered, not all. complex filter.
+select distinct id from functional_parquet.complextypestbl c, c.int_array_array cn, cn.item bottom
+where bottom.item > 2 and bottom.item not in (4,5,6);
+---- RESULTS
+1
+2
+---- TYPES
+bigint
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Multiple nesting, mixed collection type
+select id from functional_parquet.complextypestbl c, c.int_map_array cn, cn.item m
+where m.value < -2;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Scalar in struct
+select id from functional_parquet.complextypestbl c where c.nested_struct.a < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Collection in struct
+select id from functional_parquet.complextypestbl c, c.nested_struct.b a
+where a.item < -1;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Collection in struct, with the literal on the left-hand side of the comparison.
+select id from functional_parquet.complextypestbl c, c.nested_struct.b a
+where -1 > a.item;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Deeply nested collection, all required.
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d cn, cn.item a
+where a.item.e < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Deeply nested collection, top required, bottom not required.
+# TODO: with more aggressive pruning, this should skip groups as well.
+# See the false pruning example below for a case that needs the
+# more restrictive !empty guard. See org.apache.impala.planner.HdfsScanNode
+# for more details on when/why !empty guards are inserted during analysis.
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d cn
+left outer join cn.item a
+where a.item.e < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Deeply nested collection, top not required, bottom required (no !empty guard)
+select id from functional_parquet.complextypestbl c
+left outer join c.nested_struct.c.d cn, cn.item a where a.item.e < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Multiple collections, all required.
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d a, a.item aa,
+c.nested_struct.g.value.h.i b where aa.e < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Multiple collections, all required just on filtering path.
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d a,
+a.item aa left outer join c.nested_struct.g.value.h.i b where aa.e < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 2
+====
+---- QUERY
+# Multiple collections, not required on filtering path.
+select id from functional_parquet.complextypestbl c, c.nested_struct.c.d a
+left outer join a.item aa, c.nested_struct.g.value.h.i b where aa.e < -10;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# False pruning example. There is one table that's scanned (complextypestbl).
+# As a result, there is one scan. The predicate on the complex type is pushed
+# down to the scan, but the field is not required. If we erroneously prune,
+# nothing will be returned instead of the expected left-outer-join result.
+select count(*) from functional_parquet.complextypestbl c left outer join
+(select * from c.int_array where item > 10) v;
+---- RESULTS
+8
+---- TYPES
+BIGINT
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Similar, but filter is not selective, so the outer nested type is repeated for
+# several tuples. Again, no pruning is expected.
+select count(*) from functional_parquet.complextypestbl c left outer join
+(select * from c.int_array where item > -10) v;
+---- RESULTS
+12
+---- TYPES
+BIGINT
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+#
+# Min-max collection filtering for tpch data.
+#
+# Array, single level, struct type:
+select c.c_custkey from tpch_nested_parquet.customer c, c.c_orders o
+where o.o_orderkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 4
+====
+---- QUERY
+# Array, multi-level, all required.
+select c.c_custkey from tpch_nested_parquet.customer c, c.c_orders o, o.o_lineitems l
+where l.l_partkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 4
+====
+---- QUERY
+# Array, single level, optional
+select c.c_custkey from tpch_nested_parquet.customer c left outer join c.c_orders o
+where o.o_orderkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Array, multi-level, all optional
+select c.c_custkey from tpch_nested_parquet.customer c left outer join c.c_orders o
+left outer join o.o_lineitems l where l.l_partkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Array, multi-level, optional top
+select c.c_custkey from tpch_nested_parquet.customer c left outer join c.c_orders o,
+o.o_lineitems l where l.l_partkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Array, multi-level, optional bottom, filter bottom
+select c.c_custkey from tpch_nested_parquet.customer c, c.c_orders o left outer join
+o.o_lineitems l where l.l_partkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====
+---- QUERY
+# Array, multi-level, optional bottom, filter top
+select c.c_custkey from tpch_nested_parquet.customer c, c.c_orders o left outer join
+o.o_lineitems l where o.o_orderkey < 0;
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumStatsFilteredRowGroups): 4
====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/21a96ed2/tests/query_test/test_nested_types.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py
index 20def2c..68021d5 100644
--- a/tests/query_test/test_nested_types.py
+++ b/tests/query_test/test_nested_types.py
@@ -80,9 +80,6 @@ class TestNestedTypes(ImpalaTestSuite):
def test_parquet_stats(self, vector):
"""Queries that test evaluation of Parquet row group statistics."""
- # The test makes assumptions about the number of row groups that are processed and
- # skipped inside a fragment, so we ensure that the tests run in a single fragment.
- vector.get_value('exec_option')['num_nodes'] = 1
self.run_test_case('QueryTest/nested-types-parquet-stats', vector)
@SkipIfIsilon.hive