You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kg...@apache.org on 2018/08/02 10:55:39 UTC
[6/6] hive git commit: HIVE-20260: NDV of a column shouldn't be
scaled when row count is changed by filter on another column (Zoltan
Haindrich reviewed by Ashutosh Chauhan)
HIVE-20260: NDV of a column shouldn't be scaled when row count is changed by filter on another column (Zoltan Haindrich reviewed by Ashutosh Chauhan)
Signed-off-by: Zoltan Haindrich <ki...@rxd.hu>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5c02fee2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5c02fee2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5c02fee2
Branch: refs/heads/master
Commit: 5c02fee2814fa43ec385f817f1fd25795fcfc8b9
Parents: 2cabb8d
Author: Zoltan Haindrich <ki...@rxd.hu>
Authored: Thu Aug 2 12:27:18 2018 +0200
Committer: Zoltan Haindrich <ki...@rxd.hu>
Committed: Thu Aug 2 12:27:56 2018 +0200
----------------------------------------------------------------------
.../stats/annotation/AnnotateStatsProcCtx.java | 19 +
.../stats/annotation/StatsRulesProcFactory.java | 262 ++++-----
.../clientpositive/groupby_groupingset_bug.q | 4 +-
.../queries/clientpositive/reopt_semijoin.q | 3 +-
.../clientpositive/stat_estimate_drill.q | 28 +
.../clientpositive/stat_estimate_related_col.q | 3 +
.../annotate_stats_deep_filters.q.out | 4 +-
.../clientpositive/cbo_rp_auto_join1.q.out | 4 +-
.../clientpositive/llap/auto_join29.q.out | 70 +--
.../llap/auto_smb_mapjoin_14.q.out | 14 +-
.../llap/auto_sortmerge_join_10.q.out | 4 +-
.../llap/auto_sortmerge_join_9.q.out | 32 +-
.../llap/bucket_map_join_tez2.q.out | 40 +-
.../llap/bucketsortoptimize_insert_7.q.out | 12 +-
.../llap/constprog_semijoin.q.out | 14 +-
.../llap/correlationoptimizer1.q.out | 20 +-
.../llap/correlationoptimizer2.q.out | 52 +-
.../llap/correlationoptimizer6.q.out | 12 +-
.../llap/dynpart_sort_opt_vectorization.q.out | 30 +-
.../clientpositive/llap/explainuser_1.q.out | 74 +--
.../clientpositive/llap/explainuser_2.q.out | 56 +-
.../clientpositive/llap/explainuser_4.q.out | 14 +-
.../llap/filter_join_breaktask.q.out | 12 +-
.../llap/groupby_groupingset_bug.q.out | 73 ++-
.../llap/hybridgrace_hashjoin_1.q.out | 4 +-
.../clientpositive/llap/limit_pushdown.q.out | 6 +-
.../clientpositive/llap/load_dyn_part1.q.out | 20 +-
.../materialized_view_create_rewrite_3.q.out | 20 +-
...ized_view_create_rewrite_rebuild_dummy.q.out | 20 +-
.../test/results/clientpositive/llap/mrr.q.out | 10 +-
.../clientpositive/llap/multiMapJoin2.q.out | 4 +-
.../results/clientpositive/llap/orc_llap.q.out | 8 +-
.../llap/orc_predicate_pushdown.q.out | 20 +-
.../llap/parquet_predicate_pushdown.q.out | 20 +-
.../clientpositive/llap/reopt_semijoin.q.out | 28 +-
.../results/clientpositive/llap/sample10.q.out | 12 +-
.../clientpositive/llap/sample10_mm.q.out | 8 +-
.../results/clientpositive/llap/semijoin.q.out | 4 +-
.../results/clientpositive/llap/semijoin6.q.out | 14 +-
.../results/clientpositive/llap/semijoin7.q.out | 14 +-
.../results/clientpositive/llap/skewjoin.q.out | 2 +-
.../clientpositive/llap/smb_mapjoin_14.q.out | 16 +-
.../clientpositive/llap/subquery_exists.q.out | 4 +-
.../clientpositive/llap/subquery_in.q.out | 92 ++--
.../llap/subquery_in_having.q.out | 30 +-
.../clientpositive/llap/subquery_multi.q.out | 52 +-
.../clientpositive/llap/subquery_notin.q.out | 110 ++--
.../clientpositive/llap/subquery_scalar.q.out | 68 +--
.../clientpositive/llap/subquery_select.q.out | 50 +-
.../clientpositive/llap/subquery_views.q.out | 28 +-
.../llap/tez_dynpart_hashjoin_2.q.out | 24 +-
.../llap/tez_vector_dynpart_hashjoin_2.q.out | 24 +-
.../clientpositive/llap/unionDistinct_1.q.out | 18 +-
.../clientpositive/llap/unionDistinct_3.q.out | 4 +-
.../llap/vector_groupby_grouping_sets2.q.out | 6 +-
.../llap/vector_mapjoin_reduce.q.out | 22 +-
.../llap/vector_windowing_gby.q.out | 2 +-
.../llap/vector_windowing_gby2.q.out | 2 +-
.../clientpositive/llap/vectorization_0.q.out | 12 +-
.../llap/vectorization_limit.q.out | 10 +-
.../llap/vectorization_short_regress.q.out | 30 +-
.../llap/vectorized_context.q.out | 10 +-
.../llap/vectorized_mapjoin.q.out | 4 +-
.../llap/vectorized_nested_mapjoin.q.out | 6 +-
.../llap/vectorized_shufflejoin.q.out | 4 +-
.../clientpositive/llap/windowing_gby.q.out | 2 +-
.../spark/spark_explainuser_1.q.out | 34 +-
.../clientpositive/stat_estimate_drill.q.out | 526 +++++++++++++++++++
.../stat_estimate_related_col.q.out | 147 +++---
.../clientpositive/tez/explainanalyze_1.q.out | 4 +-
.../clientpositive/tez/explainanalyze_4.q.out | 12 +-
.../tez/hybridgrace_hashjoin_1.q.out | 4 +-
72 files changed, 1515 insertions(+), 912 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java
index 47ee949..0b7f14f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java
@@ -18,9 +18,13 @@
package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
public class AnnotateStatsProcCtx implements NodeProcessorCtx {
@@ -28,6 +32,8 @@ public class AnnotateStatsProcCtx implements NodeProcessorCtx {
private ParseContext pctx;
private HiveConf conf;
private Statistics andExprStats = null;
+ private Set<String> affectedColumns;
+
public AnnotateStatsProcCtx(ParseContext pctx) {
this.setParseContext(pctx);
@@ -36,6 +42,7 @@ public class AnnotateStatsProcCtx implements NodeProcessorCtx {
} else {
this.setConf(null);
}
+ affectedColumns = new HashSet<>();
}
public HiveConf getConf() {
@@ -62,4 +69,16 @@ public class AnnotateStatsProcCtx implements NodeProcessorCtx {
this.andExprStats = andExprStats;
}
+ public void clearAffectedColumns() {
+ affectedColumns.clear();
+ }
+
+ public void addAffectedColumn(ExprNodeColumnDesc column) {
+ affectedColumns.add(column.getColumn());
+ }
+
+ public Set<String> getAffectedColumns() {
+ return affectedColumns;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 997e289..01179c8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -30,6 +30,7 @@ import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.Stack;
+
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.Context;
@@ -102,7 +103,6 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -142,7 +142,7 @@ public class StatsRulesProcFactory {
// gather statistics for the first time and the attach it to table scan operator
Statistics stats = StatsUtils.collectStatistics(aspCtx.getConf(), partList, colStatsCached, table, tsop);
- stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, (Operator<?>) tsop);
+ stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, tsop);
tsop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
@@ -150,7 +150,7 @@ public class StatsRulesProcFactory {
stats.extendedToString());
}
} catch (HiveException e) {
- LOG.debug("Failed to retrieve stats ",e);
+ LOG.debug("Failed to retrieve stats ", e);
throw new SemanticException(e);
}
return null;
@@ -201,7 +201,7 @@ public class StatsRulesProcFactory {
long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats);
stats.setDataSize(dataSize);
}
- stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, (Operator<?>) sop);
+ stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, sop);
sop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
@@ -209,7 +209,7 @@ public class StatsRulesProcFactory {
}
} else {
if (parentStats != null) {
- stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, (Operator<?>) sop);
+ stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, sop);
sop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
@@ -280,11 +280,11 @@ public class StatsRulesProcFactory {
neededCols = tsop.getNeededColumns();
}
-
if (parentStats != null) {
ExprNodeDesc pred = fop.getConf().getPredicate();
// evaluate filter expression and update statistics
+ aspCtx.clearAffectedColumns();
long newNumRows = evaluateExpression(parentStats, pred, aspCtx,
neededCols, fop, parentStats.getNumRows());
Statistics st = parentStats.clone();
@@ -296,7 +296,7 @@ public class StatsRulesProcFactory {
// result in number of rows getting more than the input rows in
// which case stats need not be updated
if (newNumRows <= parentStats.getNumRows()) {
- updateStats(st, newNumRows, true, fop);
+ updateStats(st, newNumRows, true, fop, aspCtx.getAffectedColumns());
}
if (LOG.isDebugEnabled()) {
@@ -314,7 +314,7 @@ public class StatsRulesProcFactory {
}
}
- st = applyRuntimeStats(aspCtx.getParseContext().getContext(), st, (Operator<?>) fop);
+ st = applyRuntimeStats(aspCtx.getParseContext().getContext(), st, fop);
fop.setStatistics(st);
aspCtx.setAndExprStats(null);
@@ -348,11 +348,16 @@ public class StatsRulesProcFactory {
// evaluate children
long evaluatedRowCount = currNumRows;
for (ExprNodeDesc child : genFunc.getChildren()) {
+ aspCtx.clearAffectedColumns();
evaluatedRowCount = evaluateChildExpr(aspCtx.getAndExprStats(), child,
aspCtx, neededCols, op, evaluatedRowCount);
newNumRows = evaluatedRowCount;
if (satisfyPrecondition(aspCtx.getAndExprStats())) {
- updateStats(aspCtx.getAndExprStats(), newNumRows, true, op);
+ // Assumption is that columns are uncorrelated.
+ // Ndv is reduced in a conservative manner - only taking affected columns
+ // (which might be a subset of the actual *real* affected columns due to current limitation)
+ // Goal is to not let a situation in which ndv-s are underestimated happen.
+ updateStats(aspCtx.getAndExprStats(), newNumRows, true, op, aspCtx.getAffectedColumns());
} else {
updateStats(aspCtx.getAndExprStats(), newNumRows, false, op);
}
@@ -360,11 +365,14 @@ public class StatsRulesProcFactory {
} else if (udf instanceof GenericUDFOPOr) {
// for OR condition independently compute and update stats.
for (ExprNodeDesc child : genFunc.getChildren()) {
- newNumRows = StatsUtils.safeAdd(
- evaluateChildExpr(stats, child, aspCtx, neededCols, op, currNumRows),
- newNumRows);
- }
- if(newNumRows > currNumRows) {
+ newNumRows = StatsUtils.safeAdd(
+ evaluateChildExpr(stats, child, aspCtx, neededCols, op, currNumRows),
+ newNumRows);
+ }
+ // We have to clear the affected columns
+ // since currently it is not possible to get a real estimate of an OR expression.
+ aspCtx.clearAffectedColumns();
+ if (newNumRows > currNumRows) {
newNumRows = currNumRows;
}
} else if (udf instanceof GenericUDFIn) {
@@ -376,15 +384,16 @@ public class StatsRulesProcFactory {
} else if (udf instanceof GenericUDFOPNot) {
newNumRows = evaluateNotExpr(stats, pred, currNumRows, aspCtx, neededCols, op);
} else if (udf instanceof GenericUDFOPNotNull) {
- return evaluateNotNullExpr(stats, genFunc, currNumRows);
+ return evaluateNotNullExpr(stats, aspCtx, genFunc, currNumRows);
} else {
// single predicate condition
- newNumRows = evaluateChildExpr(stats, pred, aspCtx, neededCols, op,currNumRows);
+ newNumRows = evaluateChildExpr(stats, pred, aspCtx, neededCols, op, currNumRows);
}
} else if (pred instanceof ExprNodeColumnDesc) {
// can be boolean column in which case return true count
ExprNodeColumnDesc encd = (ExprNodeColumnDesc) pred;
+ aspCtx.addAffectedColumn(encd);
String colName = encd.getColumn();
String colType = encd.getTypeString();
if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
@@ -419,7 +428,7 @@ public class StatsRulesProcFactory {
}
private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, long currNumRows, AnnotateStatsProcCtx aspCtx,
- List<String> neededCols, Operator<?> op) throws SemanticException {
+ List<String> neededCols, Operator<?> op) throws SemanticException {
long numRows = currNumRows;
@@ -433,7 +442,7 @@ public class StatsRulesProcFactory {
ExprNodeDesc columnsChild = children.get(0);
boolean multiColumn;
if (columnsChild instanceof ExprNodeGenericFuncDesc &&
- ((ExprNodeGenericFuncDesc) columnsChild).getGenericUDF() instanceof GenericUDFStruct) {
+ ((ExprNodeGenericFuncDesc) columnsChild).getGenericUDF() instanceof GenericUDFStruct) {
for (int j = 0; j < columnsChild.getChildren().size(); j++) {
ExprNodeDesc columnChild = columnsChild.getChildren().get(j);
// If column is not column reference , we bail out
@@ -442,7 +451,8 @@ public class StatsRulesProcFactory {
return numRows / 2;
}
columns.add(columnChild);
- final String columnName = ((ExprNodeColumnDesc)columnChild).getColumn();
+ aspCtx.addAffectedColumn((ExprNodeColumnDesc) columnChild);
+ final String columnName = ((ExprNodeColumnDesc) columnChild).getColumn();
// if column name is not contained in needed column list then it
// is a partition column. We do not need to evaluate partition columns
// in filter expression since it will be taken care by partition pruner
@@ -451,7 +461,7 @@ public class StatsRulesProcFactory {
return numRows / 2;
}
columnStats.add(stats.getColumnStatisticsFromColName(columnName));
- values.add(Sets.<ExprNodeDescEqualityWrapper>newHashSet());
+ values.add(Sets.<ExprNodeDescEqualityWrapper> newHashSet());
}
multiColumn = true;
} else {
@@ -461,7 +471,8 @@ public class StatsRulesProcFactory {
return numRows / 2;
}
columns.add(columnsChild);
- final String columnName = ((ExprNodeColumnDesc)columnsChild).getColumn();
+ aspCtx.addAffectedColumn((ExprNodeColumnDesc) columnsChild);
+ final String columnName = ((ExprNodeColumnDesc) columnsChild).getColumn();
// if column name is not contained in needed column list then it
// is a partition column. We do not need to evaluate partition columns
// in filter expression since it will be taken care by partition pruner
@@ -470,7 +481,7 @@ public class StatsRulesProcFactory {
return numRows / 2;
}
columnStats.add(stats.getColumnStatisticsFromColName(columnName));
- values.add(Sets.<ExprNodeDescEqualityWrapper>newHashSet());
+ values.add(Sets.<ExprNodeDescEqualityWrapper> newHashSet());
multiColumn = false;
}
@@ -521,7 +532,7 @@ public class StatsRulesProcFactory {
factor = Double.max(factor, HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_MIN_RATIO));
}
float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
- return Math.round( numRows * factor * inFactor);
+ return Math.round(numRows * factor * inFactor);
}
static class RangeOps {
@@ -663,7 +674,7 @@ public class StatsRulesProcFactory {
}
private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, long currNumRows, AnnotateStatsProcCtx aspCtx,
- List<String> neededCols, Operator<?> op) throws SemanticException {
+ List<String> neededCols, Operator<?> op) throws SemanticException {
final ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) pred;
final boolean invert = Boolean.TRUE.equals(
((ExprNodeConstantDesc) fd.getChildren().get(0)).getValue()); // boolean invert (not)
@@ -688,7 +699,7 @@ public class StatsRulesProcFactory {
new GenericUDFOPAnd(), Lists.newArrayList(leftComparator, rightComparator));
if (invert) {
newExpression = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
- new GenericUDFOPNot(), Lists.newArrayList(newExpression));
+ new GenericUDFOPNot(), Lists.newArrayList(newExpression));
}
return evaluateExpression(stats, newExpression, aspCtx, neededCols, op, currNumRows);
@@ -724,6 +735,7 @@ public class StatsRulesProcFactory {
// NOT on boolean columns is possible. in which case return false count.
ExprNodeColumnDesc encd = (ExprNodeColumnDesc) leaf;
+ aspCtx.addAffectedColumn(encd);
String colName = encd.getColumn();
String colType = encd.getTypeString();
if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
@@ -742,7 +754,8 @@ public class StatsRulesProcFactory {
return numRows / 2;
}
- private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred, long currNumRows) {
+ private long evaluateColEqualsNullExpr(Statistics stats, AnnotateStatsProcCtx aspCtx, ExprNodeDesc pred,
+ long currNumRows) {
long numRows = currNumRows;
@@ -753,6 +766,7 @@ public class StatsRulesProcFactory {
if (leaf instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf;
+ aspCtx.addAffectedColumn(colDesc);
String colName = colDesc.getColumn();
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
@@ -766,8 +780,9 @@ public class StatsRulesProcFactory {
return numRows / 2;
}
- private long evaluateNotNullExpr(Statistics parentStats, ExprNodeGenericFuncDesc pred, long currNumRows) {
- long noOfNulls = getMaxNulls(parentStats, pred);
+ private long evaluateNotNullExpr(Statistics parentStats, AnnotateStatsProcCtx aspCtx, ExprNodeGenericFuncDesc pred,
+ long currNumRows) {
+ long noOfNulls = getMaxNulls(parentStats, aspCtx, pred);
long parentCardinality = currNumRows;
long newPredCardinality = parentCardinality;
@@ -780,20 +795,21 @@ public class StatsRulesProcFactory {
return newPredCardinality;
}
- private long getMaxNulls(Statistics stats, ExprNodeDesc pred) {
+ private long getMaxNulls(Statistics stats, AnnotateStatsProcCtx aspCtx, ExprNodeDesc pred) {
long tmpNoNulls = 0;
long maxNoNulls = 0;
if (pred instanceof ExprNodeColumnDesc) {
- ColStatistics cs = stats.getColumnStatisticsFromColName(((ExprNodeColumnDesc) pred)
- .getColumn());
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc) pred;
+ aspCtx.addAffectedColumn(encd);
+ ColStatistics cs = stats.getColumnStatisticsFromColName(encd.getColumn());
if (cs != null) {
tmpNoNulls = cs.getNumNulls();
}
} else if (pred instanceof ExprNodeGenericFuncDesc || pred instanceof ExprNodeColumnListDesc) {
long noNullsOfChild = 0;
for (ExprNodeDesc childExpr : pred.getChildren()) {
- noNullsOfChild = getMaxNulls(stats, childExpr);
+ noNullsOfChild = getMaxNulls(stats, aspCtx, childExpr);
if (noNullsOfChild > tmpNoNulls) {
tmpNoNulls = noNullsOfChild;
}
@@ -808,7 +824,7 @@ public class StatsRulesProcFactory {
tmpNoNulls = 0;
} else if (pred instanceof ExprNodeFieldDesc) {
// TODO Confirm this is safe
- tmpNoNulls = getMaxNulls(stats, ((ExprNodeFieldDesc) pred).getDesc());
+ tmpNoNulls = getMaxNulls(stats, aspCtx, ((ExprNodeFieldDesc) pred).getDesc());
}
if (tmpNoNulls > maxNoNulls) {
@@ -818,7 +834,8 @@ public class StatsRulesProcFactory {
return maxNoNulls;
}
- private long evaluateComparator(Statistics stats, ExprNodeGenericFuncDesc genFunc, long currNumRows) {
+ private long evaluateComparator(Statistics stats, AnnotateStatsProcCtx aspCtx, ExprNodeGenericFuncDesc genFunc,
+ long currNumRows) {
long numRows = currNumRows;
GenericUDF udf = genFunc.getGenericUDF();
@@ -827,15 +844,16 @@ public class StatsRulesProcFactory {
boolean upperBound;
String boundValue = null;
if (genFunc.getChildren().get(0) instanceof ExprNodeColumnDesc &&
- genFunc.getChildren().get(1) instanceof ExprNodeConstantDesc) {
+ genFunc.getChildren().get(1) instanceof ExprNodeConstantDesc) {
columnDesc = (ExprNodeColumnDesc) genFunc.getChildren().get(0);
constantDesc = (ExprNodeConstantDesc) genFunc.getChildren().get(1);
+ aspCtx.addAffectedColumn(columnDesc);
// Comparison to null will always return false
if (constantDesc.getValue() == null) {
return 0;
}
if (udf instanceof GenericUDFOPEqualOrGreaterThan ||
- udf instanceof GenericUDFOPGreaterThan) {
+ udf instanceof GenericUDFOPGreaterThan) {
boundValue = constantDesc.getValue().toString();
upperBound = false;
} else {
@@ -843,15 +861,16 @@ public class StatsRulesProcFactory {
upperBound = true;
}
} else if (genFunc.getChildren().get(1) instanceof ExprNodeColumnDesc &&
- genFunc.getChildren().get(0) instanceof ExprNodeConstantDesc) {
+ genFunc.getChildren().get(0) instanceof ExprNodeConstantDesc) {
columnDesc = (ExprNodeColumnDesc) genFunc.getChildren().get(1);
constantDesc = (ExprNodeConstantDesc) genFunc.getChildren().get(0);
+ aspCtx.addAffectedColumn(columnDesc);
// Comparison to null will always return false
if (constantDesc.getValue() == null) {
return 0;
}
if (udf instanceof GenericUDFOPEqualOrGreaterThan ||
- udf instanceof GenericUDFOPGreaterThan) {
+ udf instanceof GenericUDFOPGreaterThan) {
boundValue = constantDesc.getValue().toString();
upperBound = true;
} else {
@@ -865,7 +884,7 @@ public class StatsRulesProcFactory {
ColStatistics cs = stats.getColumnStatisticsFromColName(columnDesc.getColumn());
if (cs != null && cs.getRange() != null &&
- cs.getRange().maxValue != null && cs.getRange().minValue != null) {
+ cs.getRange().maxValue != null && cs.getRange().minValue != null) {
String colTypeLowerCase = columnDesc.getTypeString().toLowerCase();
try {
if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)) {
@@ -907,7 +926,7 @@ public class StatsRulesProcFactory {
}
}
} else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) ||
- colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
+ colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
int value;
if (colTypeLowerCase == serdeConstants.DATE_TYPE_NAME) {
DateWritable writableVal = new DateWritable(java.sql.Date.valueOf(boundValue));
@@ -1025,7 +1044,7 @@ public class StatsRulesProcFactory {
// special case: if both constants are not equal then return 0
if (prevConst != null &&
- !prevConst.equals(((ExprNodeConstantDesc)leaf).getValue())) {
+ !prevConst.equals(((ExprNodeConstantDesc) leaf).getValue())) {
return 0;
}
return numRows;
@@ -1048,11 +1067,12 @@ public class StatsRulesProcFactory {
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
long dvs = cs.getCountDistint();
- numRows = dvs == 0 ? numRows / 2 : Math.round( (double)numRows / dvs);
+ numRows = dvs == 0 ? numRows / 2 : Math.round((double) numRows / dvs);
return numRows;
}
} else if (leaf instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf;
+ aspCtx.addAffectedColumn(colDesc);
colName = colDesc.getColumn();
// if const is first argument then evaluate the result
@@ -1068,7 +1088,7 @@ public class StatsRulesProcFactory {
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
long dvs = cs.getCountDistint();
- numRows = dvs == 0 ? numRows / 2 : Math.round( (double)numRows / dvs);
+ numRows = dvs == 0 ? numRows / 2 : Math.round((double) numRows / dvs);
return numRows;
}
}
@@ -1077,17 +1097,17 @@ public class StatsRulesProcFactory {
} else if (udf instanceof GenericUDFOPNotEqual) {
return numRows;
} else if (udf instanceof GenericUDFOPEqualOrGreaterThan
- || udf instanceof GenericUDFOPEqualOrLessThan
- || udf instanceof GenericUDFOPGreaterThan
- || udf instanceof GenericUDFOPLessThan) {
- return evaluateComparator(stats, genFunc, numRows);
+ || udf instanceof GenericUDFOPEqualOrLessThan
+ || udf instanceof GenericUDFOPGreaterThan
+ || udf instanceof GenericUDFOPLessThan) {
+ return evaluateComparator(stats, aspCtx, genFunc, numRows);
} else if (udf instanceof GenericUDFOPNotNull) {
- return evaluateNotNullExpr(stats, genFunc, numRows);
+ return evaluateNotNullExpr(stats, aspCtx, genFunc, numRows);
} else if (udf instanceof GenericUDFOPNull) {
- return evaluateColEqualsNullExpr(stats, genFunc, numRows);
+ return evaluateColEqualsNullExpr(stats, aspCtx, genFunc, numRows);
} else if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr
- || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween
- || udf instanceof GenericUDFOPNot) {
+ || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween
+ || udf instanceof GenericUDFOPNot) {
return evaluateExpression(stats, genFunc, aspCtx, neededCols, op, numRows);
} else if (udf instanceof GenericUDFInBloomFilter) {
if (genFunc.getChildren().get(1) instanceof ExprNodeDynamicValueDesc) {
@@ -1346,7 +1366,7 @@ public class StatsRulesProcFactory {
}
// update stats, but don't update NDV as it will not change
- updateStats(stats, cardinality, true, gop, false);
+ updateStats(stats, cardinality, true, gop);
} else {
// NO COLUMN STATS
@@ -1426,7 +1446,7 @@ public class StatsRulesProcFactory {
}
}
- stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, (Operator<?>) gop);
+ stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, gop);
gop.setStatistics(stats);
if (LOG.isDebugEnabled() && stats != null) {
@@ -1577,7 +1597,6 @@ public class StatsRulesProcFactory {
*/
public static class JoinStatsRule extends FilterStatsRule implements NodeProcessor {
-
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
@@ -1604,7 +1623,7 @@ public class StatsRulesProcFactory {
// there could be case where join operators input are not RS e.g.
// map join with Spark. Since following estimation of statistics relies on join operators having it inputs as
// reduced sink it will not work for such cases. So we should not try to estimate stats
- if(allSatisfyPreCondition) {
+ if (allSatisfyPreCondition) {
for (int pos = 0; pos < parents.size(); pos++) {
if (!(jop.getParentOperators().get(pos) instanceof ReduceSinkOperator)) {
allSatisfyPreCondition = false;
@@ -1706,6 +1725,7 @@ public class StatsRulesProcFactory {
String key = ci.getInternalName();
ExprNodeDesc end = colExprMap.get(key);
if (end instanceof ExprNodeColumnDesc) {
+ aspCtx.addAffectedColumn((ExprNodeColumnDesc) end);
String colName = ((ExprNodeColumnDesc) end).getColumn();
int pos = jop.getConf().getReversedExprs().get(key);
ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
@@ -1722,23 +1742,23 @@ public class StatsRulesProcFactory {
// reason we compute interim row count, where join type isn't considered, is because later
// it will be used to estimate num nulls
- long interimRowCount = inferredRowCount !=-1 ? inferredRowCount
- :computeRowCountAssumingInnerJoin(rowCounts, denom, jop);
+ long interimRowCount = inferredRowCount != -1 ? inferredRowCount
+ : computeRowCountAssumingInnerJoin(rowCounts, denom, jop);
// final row computation will consider join type
- long joinRowCount = inferredRowCount !=-1 ? inferredRowCount
- :computeFinalRowCount(rowCounts, interimRowCount, jop);
+ long joinRowCount = inferredRowCount != -1 ? inferredRowCount
+ : computeFinalRowCount(rowCounts, interimRowCount, jop);
updateColStats(conf, stats, interimRowCount, joinRowCount, jop, rowCountParents);
// evaluate filter expression and update statistics
if (joinRowCount != -1 && jop.getConf().getNoOuterJoin() &&
- jop.getConf().getResidualFilterExprs() != null &&
- !jop.getConf().getResidualFilterExprs().isEmpty()) {
+ jop.getConf().getResidualFilterExprs() != null &&
+ !jop.getConf().getResidualFilterExprs().isEmpty()) {
ExprNodeDesc pred;
if (jop.getConf().getResidualFilterExprs().size() > 1) {
pred = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
- FunctionRegistry.getGenericUDFForAnd(),
- jop.getConf().getResidualFilterExprs());
+ FunctionRegistry.getGenericUDFForAnd(),
+ jop.getConf().getResidualFilterExprs());
} else {
pred = jop.getConf().getResidualFilterExprs().get(0);
}
@@ -1779,8 +1799,8 @@ public class StatsRulesProcFactory {
// Update cross size
long newCrossRowCount = StatsUtils.safeMult(crossRowCount, rowCount);
long newCrossDataSize = StatsUtils.safeAdd(
- StatsUtils.safeMult(crossDataSize, rowCount),
- StatsUtils.safeMult(dataSize, crossRowCount));
+ StatsUtils.safeMult(crossDataSize, rowCount),
+ StatsUtils.safeMult(dataSize, crossRowCount));
crossRowCount = newCrossRowCount;
crossDataSize = newCrossDataSize;
// Update largest relation
@@ -1800,7 +1820,7 @@ public class StatsRulesProcFactory {
cartesianProduct = keyExprs.size() == 0;
} else if (jop instanceof AbstractMapJoinOperator) {
AbstractMapJoinOperator<? extends MapJoinDesc> mjop =
- (AbstractMapJoinOperator<? extends MapJoinDesc>) jop;
+ (AbstractMapJoinOperator<? extends MapJoinDesc>) jop;
List<ExprNodeDesc> keyExprs = mjop.getConf().getKeys().values().iterator().next();
cartesianProduct = keyExprs.size() == 0;
}
@@ -1824,20 +1844,20 @@ public class StatsRulesProcFactory {
// evaluate filter expression and update statistics
if (jop.getConf().getNoOuterJoin() &&
- jop.getConf().getResidualFilterExprs() != null &&
- !jop.getConf().getResidualFilterExprs().isEmpty()) {
+ jop.getConf().getResidualFilterExprs() != null &&
+ !jop.getConf().getResidualFilterExprs().isEmpty()) {
long joinRowCount = newNumRows;
ExprNodeDesc pred;
if (jop.getConf().getResidualFilterExprs().size() > 1) {
pred = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
- FunctionRegistry.getGenericUDFForAnd(),
- jop.getConf().getResidualFilterExprs());
+ FunctionRegistry.getGenericUDFForAnd(),
+ jop.getConf().getResidualFilterExprs());
} else {
pred = jop.getConf().getResidualFilterExprs().get(0);
}
// evaluate filter expression and update statistics
- newNumRows = evaluateExpression(wcStats, pred,
- aspCtx, jop.getSchema().getColumnNames(), jop, wcStats.getNumRows());
+ newNumRows = evaluateExpression(wcStats, pred,
+ aspCtx, jop.getSchema().getColumnNames(), jop, wcStats.getNumRows());
// update only the basic statistics in the absence of column statistics
if (newNumRows <= joinRowCount) {
updateStats(wcStats, newNumRows, false, jop);
@@ -1998,7 +2018,7 @@ public class StatsRulesProcFactory {
// For the above complex operator tree,
// selectivity(JOIN) = selectivity(RS-1) * selectivity(RS-2) and
// selectivity(RS-3) = numRows(RS-3)/numRows(JOIN) * selectivity(JOIN)
- while(multiParentOp == null) {
+ while (multiParentOp == null) {
if (op.getParentOperators().size() > 1) {
multiParentOp = op;
} else {
@@ -2013,28 +2033,28 @@ public class StatsRulesProcFactory {
// if it is two way left outer or right outer join take selectivity only for
// corresponding branch since only that branch will factor is the reduction
- if(multiParentOp instanceof JoinOperator) {
- JoinOperator jop = ((JoinOperator)multiParentOp);
+ if (multiParentOp instanceof JoinOperator) {
+ JoinOperator jop = ((JoinOperator) multiParentOp);
isSelComputed = true;
// check for two way join
- if(jop.getConf().getConds().length == 1) {
- switch(jop.getConf().getCondsList().get(0).getType()) {
- case JoinDesc.LEFT_OUTER_JOIN:
- selMultiParent *= getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0));
- break;
- case JoinDesc.RIGHT_OUTER_JOIN:
- selMultiParent *= getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1));
- break;
- default:
- // for rest of the join type we will take min of the reduction.
- float selMultiParentLeft = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0));
- float selMultiParentRight = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1));
- selMultiParent = Math.min(selMultiParentLeft, selMultiParentRight);
+ if (jop.getConf().getConds().length == 1) {
+ switch (jop.getConf().getCondsList().get(0).getType()) {
+ case JoinDesc.LEFT_OUTER_JOIN:
+ selMultiParent *= getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0));
+ break;
+ case JoinDesc.RIGHT_OUTER_JOIN:
+ selMultiParent *= getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1));
+ break;
+ default:
+ // for rest of the join type we will take min of the reduction.
+ float selMultiParentLeft = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(0));
+ float selMultiParentRight = getSelectivitySimpleTree(multiParentOp.getParentOperators().get(1));
+ selMultiParent = Math.min(selMultiParentLeft, selMultiParentRight);
}
}
}
- if(!isSelComputed) {
+ if (!isSelComputed) {
for (Operator<? extends OperatorDesc> parent : multiParentOp.getParentOperators()) {
// In the above example, TS-1 -> RS-1 and TS-2 -> RS-2 are simple trees
selMultiParent *= getSelectivitySimpleTree(parent);
@@ -2072,7 +2092,7 @@ public class StatsRulesProcFactory {
ColStatistics cs = rsOp.getStatistics().getColumnStatisticsFromColName(joinCol);
if (cs != null && !cs.isPrimaryKey()) {
if (StatsUtils.inferForeignKey(csPK, cs)) {
- result.put(i,cs);
+ result.put(i, cs);
}
}
}
@@ -2111,7 +2131,7 @@ public class StatsRulesProcFactory {
}
private boolean isJoinKey(final String columnName,
- final ExprNodeDesc[][] joinKeys) {
+ final ExprNodeDesc[][] joinKeys) {
for (int i = 0; i < joinKeys.length; i++) {
for (ExprNodeDesc expr : Arrays.asList(joinKeys[i])) {
@@ -2133,36 +2153,34 @@ public class StatsRulesProcFactory {
return;
}
-
long oldNumNulls = colStats.getNumNulls();
long newNumNulls = Math.min(newNumRows, oldNumNulls);
JoinCondDesc joinCond = jop.getConf().getConds()[0];
switch (joinCond.getType()) {
- case JoinDesc.LEFT_OUTER_JOIN :
+ case JoinDesc.LEFT_OUTER_JOIN:
//if this column is coming from right input only then we update num nulls
- if(pos == joinCond.getRight()
+ if (pos == joinCond.getRight()
&& interimNumRows != newNumRows) {
// interim row count can not be less due to containment
// assumption in join cardinality computation
- assert(newNumRows > interimNumRows);
- if(isJoinKey(colStats.getColumnName(), jop.getConf().getJoinKeys())) {
- newNumNulls = Math.min(newNumRows, (newNumRows-interimNumRows));
- }
- else {
- newNumNulls = Math.min(newNumRows, oldNumNulls + (newNumRows-interimNumRows));
+ assert (newNumRows > interimNumRows);
+ if (isJoinKey(colStats.getColumnName(), jop.getConf().getJoinKeys())) {
+ newNumNulls = Math.min(newNumRows, (newNumRows - interimNumRows));
+ } else {
+ newNumNulls = Math.min(newNumRows, oldNumNulls + (newNumRows - interimNumRows));
}
}
break;
case JoinDesc.RIGHT_OUTER_JOIN:
- if(pos == joinCond.getLeft()
+ if (pos == joinCond.getLeft()
&& interimNumRows != newNumRows) {
// interim row count can not be less due to containment
// assumption in join cardinality computation
// interimNumRows represent number of matches for join keys on two sides.
// newNumRows-interimNumRows represent number of non-matches.
- assert(newNumRows > interimNumRows);
+ assert (newNumRows > interimNumRows);
if (isJoinKey(colStats.getColumnName(), jop.getConf().getJoinKeys())) {
newNumNulls = Math.min(newNumRows, (newNumRows - interimNumRows));
@@ -2194,7 +2212,7 @@ public class StatsRulesProcFactory {
if (newNumRows < 0) {
LOG.debug("STATS-" + jop.toString() + ": Overflow in number of rows. "
- + newNumRows + " rows will be set to Long.MAX_VALUE");
+ + newNumRows + " rows will be set to Long.MAX_VALUE");
}
if (newNumRows == 0) {
LOG.debug("STATS-" + jop.toString() + ": Equals 0 in number of rows. "
@@ -2242,7 +2260,8 @@ public class StatsRulesProcFactory {
}
}
if (neededColumns.size() != 0) {
- int restColumnsDefaultSize = StatsUtils.estimateRowSizeFromSchema(conf, jop.getSchema().getSignature(), neededColumns);
+ int restColumnsDefaultSize =
+ StatsUtils.estimateRowSizeFromSchema(conf, jop.getSchema().getSignature(), neededColumns);
newDataSize = StatsUtils.safeAdd(newDataSize, StatsUtils.safeMult(restColumnsDefaultSize, newNumRows));
}
stats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
@@ -2258,20 +2277,20 @@ public class StatsRulesProcFactory {
case JoinDesc.INNER_JOIN:
// only dealing with special join types here.
break;
- case JoinDesc.LEFT_OUTER_JOIN :
+ case JoinDesc.LEFT_OUTER_JOIN:
// all rows from left side will be present in resultset
result = Math.max(rowCountParents.get(joinCond.getLeft()), result);
break;
- case JoinDesc.RIGHT_OUTER_JOIN :
+ case JoinDesc.RIGHT_OUTER_JOIN:
// all rows from right side will be present in resultset
result = Math.max(rowCountParents.get(joinCond.getRight()), result);
break;
- case JoinDesc.FULL_OUTER_JOIN :
+ case JoinDesc.FULL_OUTER_JOIN:
// all rows from both side will be present in resultset
result = Math.max(StatsUtils.safeAdd(rowCountParents.get(joinCond.getRight()),
rowCountParents.get(joinCond.getLeft())), result);
break;
- case JoinDesc.LEFT_SEMI_JOIN :
+ case JoinDesc.LEFT_SEMI_JOIN:
// max # of rows = rows from left side
result = Math.min(rowCountParents.get(joinCond.getLeft()), result);
break;
@@ -2282,6 +2301,7 @@ public class StatsRulesProcFactory {
}
return result;
}
+
private long computeRowCountAssumingInnerJoin(List<Long> rowCountParents, long denom,
CommonJoinOperator<? extends JoinDesc> join) {
double factor = 0.0d;
@@ -2405,7 +2425,7 @@ public class StatsRulesProcFactory {
if (satisfyPrecondition(parentStats)) {
Statistics stats = parentStats.clone();
List<ColStatistics> colStats = StatsUtils.getColStatisticsUpdatingTableAlias(
- parentStats, lop.getSchema());
+ parentStats, lop.getSchema());
stats.setColumnStats(colStats);
// if limit is greater than available rows then do not update
@@ -2413,7 +2433,7 @@ public class StatsRulesProcFactory {
if (limit <= parentStats.getNumRows()) {
updateStats(stats, limit, true, lop);
}
- stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, (Operator<?>) lop);
+ stats = applyRuntimeStats(aspCtx.getParseContext().getContext(), stats, lop);
lop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
@@ -2426,7 +2446,7 @@ public class StatsRulesProcFactory {
// based on average row size
limit = StatsUtils.getMaxIfOverflow(limit);
Statistics wcStats = parentStats.scaleToRowCount(limit, true);
- wcStats = applyRuntimeStats(aspCtx.getParseContext().getContext(), wcStats, (Operator<?>) lop);
+ wcStats = applyRuntimeStats(aspCtx.getParseContext().getContext(), wcStats, lop);
lop.setStatistics(wcStats);
if (LOG.isDebugEnabled()) {
LOG.debug("[1] STATS-" + lop.toString() + ": " + wcStats.extendedToString());
@@ -2448,7 +2468,8 @@ public class StatsRulesProcFactory {
public static class ReduceSinkStatsRule extends DefaultStatsRule implements NodeProcessor {
@Override
- public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
+ throws SemanticException {
ReduceSinkOperator rop = (ReduceSinkOperator) nd;
Operator<? extends OperatorDesc> parent = rop.getParentOperators().get(0);
Statistics parentStats = parent.getStatistics();
@@ -2488,7 +2509,7 @@ public class StatsRulesProcFactory {
outStats.setColumnStats(colStats);
}
- outStats = applyRuntimeStats(aspCtx.getParseContext().getContext(), outStats, (Operator<?>) rop);
+ outStats = applyRuntimeStats(aspCtx.getParseContext().getContext(), outStats, rop);
rop.setStatistics(outStats);
if (LOG.isDebugEnabled()) {
LOG.debug("[0] STATS-" + rop.toString() + ": " + outStats.extendedToString());
@@ -2563,7 +2584,8 @@ public class StatsRulesProcFactory {
}
stats.updateColumnStatsState(parentStats.getColumnStatsState());
- List<ColStatistics> colStats = StatsUtils.getColStatisticsFromExprMap(hconf, parentStats, op.getColumnExprMap(), op.getSchema());
+ List<ColStatistics> colStats =
+ StatsUtils.getColStatisticsFromExprMap(hconf, parentStats, op.getColumnExprMap(), op.getSchema());
stats.addToColumnStats(colStats);
if (LOG.isDebugEnabled()) {
@@ -2626,7 +2648,6 @@ public class StatsRulesProcFactory {
return new DefaultStatsRule();
}
-
/**
* Update the basic statistics of the statistics object based on the row number
* @param stats
@@ -2638,12 +2659,12 @@ public class StatsRulesProcFactory {
*/
static void updateStats(Statistics stats, long newNumRows,
boolean useColStats, Operator<? extends OperatorDesc> op) {
- updateStats(stats, newNumRows, useColStats, op, true);
+ updateStats(stats, newNumRows, useColStats, op, Collections.<String>emptySet());
}
static void updateStats(Statistics stats, long newNumRows,
boolean useColStats, Operator<? extends OperatorDesc> op,
- boolean updateNDV) {
+ Set<String> affectedColumns) {
if (newNumRows < 0) {
LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. "
@@ -2667,7 +2688,7 @@ public class StatsRulesProcFactory {
long oldDV = cs.getCountDistint();
long newNumNulls = Math.round(ratio * oldNumNulls);
cs.setNumNulls(newNumNulls);
- if (updateNDV) {
+ if (affectedColumns.contains(cs.getColumnName())) {
long newDV = oldDV;
// if ratio is greater than 1, then number of rows increases. This can happen
@@ -2678,6 +2699,10 @@ public class StatsRulesProcFactory {
newDV = (long) Math.ceil(ratio * oldDV);
}
cs.setCountDistint(newDV);
+ oldDV = newDV;
+ }
+ if (oldDV > newNumRows) {
+ cs.setCountDistint(newNumRows);
}
}
stats.setColumnStats(colStats);
@@ -2694,7 +2719,6 @@ public class StatsRulesProcFactory {
&& !stats.getColumnStatsState().equals(Statistics.State.NONE);
}
-
private static Statistics applyRuntimeStats(Context context, Statistics stats, Operator<?> op) {
if (!((HiveConf) context.getConf()).getBoolVar(ConfVars.HIVE_QUERY_REEXECUTION_ENABLED)) {
return stats;
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q b/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q
index c6e9a1a..6c4ba33 100644
--- a/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q
+++ b/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q
@@ -43,13 +43,15 @@ alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set(
'rawDataSize'='1234567');
alter table x1_date_dim update statistics set(
-'numRows'='56',
+'numRows'='28',
'rawDataSize'='81449');
alter table x1_item update statistics set(
'numRows'='18',
'rawDataSize'='32710');
+-- note: it is important that the below query uses DPP!
+
explain
select count(*) cnt
from
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/queries/clientpositive/reopt_semijoin.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/reopt_semijoin.q b/ql/src/test/queries/clientpositive/reopt_semijoin.q
index 0eacb8a..75db410 100644
--- a/ql/src/test/queries/clientpositive/reopt_semijoin.q
+++ b/ql/src/test/queries/clientpositive/reopt_semijoin.q
@@ -38,7 +38,7 @@ alter table x1_store_sales update statistics set(
'rawDataSize'='1234567');
alter table x1_date_dim update statistics set(
-'numRows'='56',
+'numRows'='28',
'rawDataSize'='81449');
@@ -51,6 +51,7 @@ set hive.tez.min.bloom.filter.entries=1;
set hive.tez.bloom.filter.factor=1.0f;
set hive.explain.user=false;
+-- note: this plan should involve a semijoin reduction
explain
select sum(s.ss_item_sk)
from
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/queries/clientpositive/stat_estimate_drill.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/stat_estimate_drill.q b/ql/src/test/queries/clientpositive/stat_estimate_drill.q
new file mode 100644
index 0000000..2b711d6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/stat_estimate_drill.q
@@ -0,0 +1,28 @@
+set hive.explain.user=true;
+set hive.strict.checks.cartesian.product=false;
+set hive.stats.fetch.column.stats=true;
+
+drop table if exists t1;
+drop table if exists t3;
+
+create table t1 (a integer);
+create table t3 (a integer,b integer,c integer);
+
+insert into t1 values (1),(2),(3),(4),(5),(6),(7),(8),(9),(0);
+insert into t3
+ select x1.a as a,x2.a as b,x3.a as c from
+ t1 x1
+ join t1 x2
+ join t1 x3;
+
+analyze table t3 compute statistics for columns;
+
+explain analyze select sum(a) from t3 where b in (2,3) group by b;
+
+explain analyze select sum(a) from t3 where a=1 or a=2 group by b;
+explain analyze select sum(a) from t3 where a=1 or (a=2 and b=3) group by b;
+explain analyze select sum(a) from t3 where a=1 group by b;
+explain analyze select sum(a) from t3 where a=1 and b=2 group by b;
+explain analyze select sum(a) from t3 where a=1 and b=2 and c=3 group by b;
+
+explain analyze select sum(a) from t3 where (a=1 and b=2) or (a=2 and b=3) or (a=3 and b=4) group by b;
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/queries/clientpositive/stat_estimate_related_col.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/stat_estimate_related_col.q b/ql/src/test/queries/clientpositive/stat_estimate_related_col.q
index 52da2f7..54deb5b 100644
--- a/ql/src/test/queries/clientpositive/stat_estimate_related_col.q
+++ b/ql/src/test/queries/clientpositive/stat_estimate_related_col.q
@@ -1,3 +1,6 @@
+-- disable cbo because calcite can see thru these test cases; the goal here is to test the annotation processing
+set hive.cbo.enable=false;
+
set hive.explain.user=true;
set hive.strict.checks.cartesian.product=false;
set hive.stats.fetch.column.stats=true;
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out b/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
index 83bb65e..3ffb002 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
@@ -124,9 +124,9 @@ STAGE PLANS:
Statistics: Num rows: 2098 Data size: 16744 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (((t = 10Y) and (si = 11S)) or ((t = 11Y) and (si = 12S)) or ((t = 12Y) and (si = 13S)) or ((t = 13Y) and (si = 14S)) or ((t = 14Y) and (si = 15S)) or ((t = 15Y) and (si = 16S)) or ((t = 16Y) and (si = 17S)) or ((t = 17Y) and (si = 18S)) or ((t = 1Y) and (si = 2S)) or ((t = 27Y) and (si = 28S)) or ((t = 2Y) and (si = 3S)) or ((t = 37Y) and (si = 38S)) or ((t = 3Y) and (si = 4S)) or ((t = 47Y) and (si = 48S)) or ((t = 4Y) and (si = 5S)) or ((t = 52Y) and (si = 53S)) or ((t = 5Y) and (si = 6S)) or ((t = 6Y) and (si = 7S)) or ((t = 7Y) and (si = 8S)) or ((t = 9Y) and (si = 10S))) (type: boolean)
- Statistics: Num rows: 160 Data size: 1280 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 20 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
- Statistics: Num rows: 160 Data size: 1280 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 20 Data size: 160 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out b/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
index f937834..c0b6707 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
@@ -1100,11 +1100,11 @@ STAGE PLANS:
0 key (type: int)
1 key (type: int)
outputColumnNames: $f1, $f10
- Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: ($f1 * $f10) (type: bigint)
outputColumnNames: $f4
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: $sum0($f4)
mode: hash
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/results/clientpositive/llap/auto_join29.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/auto_join29.q.out b/ql/src/test/results/clientpositive/llap/auto_join29.q.out
index 9db7cb3..55f3797 100644
--- a/ql/src/test/results/clientpositive/llap/auto_join29.q.out
+++ b/ql/src/test/results/clientpositive/llap/auto_join29.q.out
@@ -696,25 +696,25 @@ STAGE PLANS:
1 key (type: string)
2 key (type: string)
outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11
- Statistics: Num rows: 137 Data size: 73158 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 137 Data size: 73158 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 137 Data size: 73158 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 137 Data size: 73158 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 137 Data size: 73158 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -1316,25 +1316,25 @@ STAGE PLANS:
1 key (type: string)
2 key (type: string)
outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11
- Statistics: Num rows: 415 Data size: 221610 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 262 Data size: 139908 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 415 Data size: 221610 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 262 Data size: 139908 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 415 Data size: 221610 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 262 Data size: 139908 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 415 Data size: 221610 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 262 Data size: 139908 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 415 Data size: 221610 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 262 Data size: 139908 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -2576,25 +2576,25 @@ STAGE PLANS:
1 key (type: string)
2 key (type: string)
outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -2696,25 +2696,25 @@ STAGE PLANS:
1 key (type: string)
2 key (type: string)
outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -3274,15 +3274,15 @@ STAGE PLANS:
input vertices:
1 Map 3
2 Map 4
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized, llap
LLAP IO: no inputs
Map 3
@@ -3325,10 +3325,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 136 Data size: 72624 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 87 Data size: 46458 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -3409,15 +3409,15 @@ STAGE PLANS:
input vertices:
0 Map 1
2 Map 4
- Statistics: Num rows: 414 Data size: 221076 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 166 Data size: 88644 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 414 Data size: 221076 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 166 Data size: 88644 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 414 Data size: 221076 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 166 Data size: 88644 Basic stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized, llap
LLAP IO: no inputs
Map 4
@@ -3443,10 +3443,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 414 Data size: 221076 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 166 Data size: 88644 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 414 Data size: 221076 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 166 Data size: 88644 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -3528,15 +3528,15 @@ STAGE PLANS:
input vertices:
1 Map 3
2 Map 4
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string)
sort order: ++++++
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized, llap
LLAP IO: no inputs
Map 3
@@ -3579,10 +3579,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: string), KEY.reducesinkkey5 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 135 Data size: 72090 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 55 Data size: 29370 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/results/clientpositive/llap/auto_smb_mapjoin_14.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/auto_smb_mapjoin_14.q.out b/ql/src/test/results/clientpositive/llap/auto_smb_mapjoin_14.q.out
index 21273c1..1c3c554 100644
--- a/ql/src/test/results/clientpositive/llap/auto_smb_mapjoin_14.q.out
+++ b/ql/src/test/results/clientpositive/llap/auto_smb_mapjoin_14.q.out
@@ -582,7 +582,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -701,7 +701,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -844,7 +844,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -975,7 +975,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1207,7 +1207,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1332,7 +1332,7 @@ STAGE PLANS:
0 _col0 (type: int)
1 _col0 (type: int)
2 _col0 (type: int)
- Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1459,7 +1459,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_10.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_10.q.out b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_10.q.out
index 9e07f1c..3f1d832 100644
--- a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_10.q.out
+++ b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_10.q.out
@@ -146,7 +146,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 9 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -260,7 +260,7 @@ STAGE PLANS:
keys: key (type: int)
mode: final
outputColumnNames: _col0
- Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
Merge Join Operator
condition map:
Inner Join 0 to 1
http://git-wip-us.apache.org/repos/asf/hive/blob/5c02fee2/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_9.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_9.q.out b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_9.q.out
index 4ffa7e2..e9a8b01 100644
--- a/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_9.q.out
+++ b/ql/src/test/results/clientpositive/llap/auto_sortmerge_join_9.q.out
@@ -701,7 +701,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -820,7 +820,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -963,7 +963,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1094,7 +1094,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1326,7 +1326,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1429,7 +1429,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1556,7 +1556,7 @@ STAGE PLANS:
0 _col0 (type: int)
1 _col0 (type: int)
2 _col0 (type: int)
- Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -1683,7 +1683,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -2410,7 +2410,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -2529,7 +2529,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -2672,7 +2672,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -2803,7 +2803,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -2908,7 +2908,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -3011,7 +3011,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -3138,7 +3138,7 @@ STAGE PLANS:
0 _col0 (type: int)
1 _col0 (type: int)
2 _col0 (type: int)
- Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash
@@ -3265,7 +3265,7 @@ STAGE PLANS:
keys:
0 _col0 (type: int)
1 _col0 (type: int)
- Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
mode: hash