You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2015/05/27 06:20:50 UTC
[2/2] hive git commit: HIVE-10812 : Scaling PK/FK's selectivity for stats annotation (Pengcheng Xiong via John Pullokkaran)

HIVE-10812 : Scaling PK/FK's selectivity for stats annotation (Pengcheng Xiong via John Pullokkaran)

Signed-off-by: Ashutosh Chauhan <ha...@apache.org>

Conflicts:
	ql/src/test/results/clientpositive/udf_crc32.q.out
	ql/src/test/results/clientpositive/udf_sha1.q.out


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5f8b7016
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5f8b7016
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5f8b7016

Branch: refs/heads/branch-1.2
Commit: 5f8b7016b7d48343e34a4c9baaf971d2279625db
Parents: 5bb039c
Author: Pengcheng Xiong <px...@hortonworks.com>
Authored: Tue May 26 11:45:00 2015 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Tue May 26 21:20:11 2015 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/exec/Utilities.java   |   2 +
 .../stats/annotation/StatsRulesProcFactory.java | 155 +++++++++++--------
 .../apache/hadoop/hive/ql/stats/StatsUtils.java |  35 ++++-
 .../clientpositive/annotate_stats_join.q.out    |   8 +-
 .../annotate_stats_join_pkfk.q.out              |  42 ++---
 .../spark/annotate_stats_join.q.out             |  10 +-
 .../tez/vector_null_projection.q.out            |  10 ++
 .../results/clientpositive/vector_join30.q.out  |  24 +++
 8 files changed, 194 insertions(+), 92 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index ad5c8f8..9582e3a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -140,6 +140,7 @@ import org.apache.hadoop.hive.ql.metadata.InputEstimator;
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
 import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
@@ -1095,6 +1096,7 @@ public final class Utilities {
       kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
       removeField(kryo, Operator.class, "colExprMap");
       removeField(kryo, ColumnInfo.class, "objectInspector");
+      removeField(kryo, AbstractOperatorDesc.class, "statistics");
       return kryo;
     };
   };

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 571c050..0982059 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -77,6 +77,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.Stack;
 
@@ -1061,7 +1062,6 @@ public class StatsRulesProcFactory {
           numAttr = keyExprs.size();
 
           // infer PK-FK relationship in single attribute join case
-          pkfkInferred = false;
           inferPKFKRelationship();
           // get the join keys from parent ReduceSink operators
           for (int pos = 0; pos < parents.size(); pos++) {
@@ -1197,53 +1197,42 @@ public class StatsRulesProcFactory {
 
     private void inferPKFKRelationship() {
       if (numAttr == 1) {
-        List<Integer> parentsWithPK = getPrimaryKeyCandidates(parents);
-
-        // in case of fact to many dimensional tables join, the join key in fact table will be
-        // mostly foreign key which will have corresponding primary key in dimension table.
-        // The selectivity of fact table in that case will be product of all selectivities of
-        // dimension tables (assumes conjunctivity)
-        for (Integer id : parentsWithPK) {
-          ColStatistics csPK = null;
-          Operator<? extends OperatorDesc> parent = parents.get(id);
-          for (ColStatistics cs : parent.getStatistics().getColumnStats()) {
-            if (cs.isPrimaryKey()) {
-              csPK = cs;
-              break;
-            }
-          }
+        // If numAttr is 1, this means we join on one single key column.
+        Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
 
-          // infer foreign key candidates positions
-          List<Integer> parentsWithFK = getForeignKeyCandidates(parents, csPK);
-          if (parentsWithFK.size() == 1 &&
-              parentsWithFK.size() + parentsWithPK.size() == parents.size()) {
-            Operator<? extends OperatorDesc> parentWithFK = parents.get(parentsWithFK.get(0));
-            List<Float> parentsSel = getSelectivity(parents, parentsWithPK);
-            Float prodSelectivity = 1.0f;
-            for (Float selectivity : parentsSel) {
-              prodSelectivity *= selectivity;
-            }
-            newNumRows = (long) Math.ceil(
-                parentWithFK.getStatistics().getNumRows() * prodSelectivity);
-            pkfkInferred = true;
+        // We only allow one single PK.
+        if (parentsWithPK.size() != 1) {
+          LOG.debug("STATS-" + jop.toString() + ": detects multiple PK parents.");
+          return;
+        }
+        Integer pkPos = parentsWithPK.keySet().iterator().next();
+        ColStatistics csPK = parentsWithPK.values().iterator().next();
 
-            // some debug information
-            if (isDebugEnabled) {
-              List<String> parentIds = Lists.newArrayList();
+        // infer foreign key candidates positions
+        Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
 
-              // print primary key containing parents
-              for (Integer i : parentsWithPK) {
-                parentIds.add(parents.get(i).toString());
-              }
-              LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
-              parentIds.clear();
+        // we allow multiple foreign keys (snowflake schema)
+        // csFKs.size() + 1 == parents.size() means we have a single PK and all
+        // the remaining parents are FKs.
+        if (csFKs.size() + 1 == parents.size()) {
+          getSelectivity(parents, pkPos, csPK, csFKs);
 
-              // print foreign key containing parents
-              for (Integer i : parentsWithFK) {
-                parentIds.add(parents.get(i).toString());
-              }
-              LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
+          // some debug information
+          if (isDebugEnabled) {
+            List<String> parentIds = Lists.newArrayList();
+
+            // print primary key containing parents
+            for (Integer i : parentsWithPK.keySet()) {
+              parentIds.add(parents.get(i).toString());
             }
+            LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
+            parentIds.clear();
+
+            // print foreign key containing parents
+            for (Integer i : csFKs.keySet()) {
+              parentIds.add(parents.get(i).toString());
+            }
+            LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
           }
         }
       }
@@ -1251,19 +1240,63 @@ public class StatsRulesProcFactory {
 
     /**
      * Get selectivity of reduce sink operators.
-     * @param ops - reduce sink operators
-     * @param opsWithPK - reduce sink operators with primary keys
-     * @return - list of selectivity for primary key containing operators
+     * @param csPK - ColStatistics for a single primary key
+     * @param csFKs - ColStatistics for multiple foreign keys
      */
-    private List<Float> getSelectivity(List<Operator<? extends OperatorDesc>> ops,
-        List<Integer> opsWithPK) {
-      List<Float> result = Lists.newArrayList();
-      for (Integer idx : opsWithPK) {
-        Operator<? extends OperatorDesc> op = ops.get(idx);
-        float selectivity = getSelectivitySimpleTree(op);
-        result.add(selectivity);
+    private void getSelectivity(List<Operator<? extends OperatorDesc>> ops, Integer pkPos, ColStatistics csPK,
+        Map<Integer, ColStatistics> csFKs) {
+      this.pkfkInferred = true;
+      double pkfkSelectivity = Double.MAX_VALUE;
+      int fkInd = -1;
+      // 1. We iterate through all the operators that have candidate FKs and
+      // choose the FK that has the minimum selectivity. We assume that PK and this FK
+      // have the PK-FK relationship. This is heuristic and can be
+      // improved later.
+      for (Entry<Integer, ColStatistics> entry : csFKs.entrySet()) {
+        int pos = entry.getKey();
+        Operator<? extends OperatorDesc> opWithPK = ops.get(pkPos);
+        double selectivity = getSelectivitySimpleTree(opWithPK);
+        double selectivityAdjustment = StatsUtils.getScaledSelectivity(csPK, entry.getValue());
+        selectivity = selectivityAdjustment * selectivity > 1 ? selectivity : selectivityAdjustment
+            * selectivity;
+        if (selectivity < pkfkSelectivity) {
+          pkfkSelectivity = selectivity;
+          fkInd = pos;
+        }
+      }
+      long newrows = 1;
+      List<Long> rowCounts = Lists.newArrayList();
+      List<Long> distinctVals = Lists.newArrayList();
+      // 2. We then iterate through all the operators that have candidate FKs again.
+      // We assume the PK is first joining with the FK that we just selected.
+      // And we apply the PK-FK relationship when we compute the newrows and ndv.
+      // After that, we join the result with all the other FKs.
+      // We do not assume the PK-FK relationship anymore and just compute the
+      // row count using the classic formula.
+      for (Entry<Integer, ColStatistics> entry : csFKs.entrySet()) {
+        int pos = entry.getKey();
+        ColStatistics csFK = entry.getValue();
+        ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
+        Statistics parentStats = parent.getStatistics();
+        if (fkInd == pos) {
+          // 2.1 This is the new number of rows after PK is joining with FK
+          newrows = (long) Math.ceil(parentStats.getNumRows() * pkfkSelectivity);
+          rowCounts.add(newrows);
+          // 2.1 The ndv is the minimum of the PK and the FK.
+          distinctVals.add(Math.min(csFK.getCountDistint(), csPK.getCountDistint()));
+        } else {
+          // 2.2 All the other FKs.
+          rowCounts.add(parentStats.getNumRows());
+          distinctVals.add(csFK.getCountDistint());
+        }
+      }
+      if (csFKs.size() == 1) {
+        // there is only one FK
+        this.newNumRows = newrows;
+      } else {
+        // there is more than one FK
+        this.newNumRows = this.computeNewRowCount(rowCounts, getDenominator(distinctVals));
       }
-      return result;
     }
 
     private float getSelectivitySimpleTree(Operator<? extends OperatorDesc> op) {
@@ -1323,11 +1356,11 @@ public class StatsRulesProcFactory {
      * primary key range (inferred as foreign keys).
      * @param ops - operators
      * @param csPK - column statistics of primary key
-     * @return - list of foreign key containing parent ids
+     * @return - a map which contains position ids and the corresponding column statistics
      */
-    private List<Integer> getForeignKeyCandidates(List<Operator<? extends OperatorDesc>> ops,
+    private Map<Integer, ColStatistics> getForeignKeyCandidates(List<Operator<? extends OperatorDesc>> ops,
         ColStatistics csPK) {
-      List<Integer> result = Lists.newArrayList();
+      Map<Integer, ColStatistics> result = new HashMap<Integer, ColStatistics>();
       if (csPK == null || ops == null) {
         return result;
       }
@@ -1343,7 +1376,7 @@ public class StatsRulesProcFactory {
               ColStatistics cs = rsOp.getStatistics().getColumnStatisticsFromColName(joinCol);
               if (cs != null && !cs.isPrimaryKey()) {
                 if (StatsUtils.inferForeignKey(csPK, cs)) {
-                  result.add(i);
+                  result.put(i,cs);
                 }
               }
             }
@@ -1358,8 +1391,8 @@ public class StatsRulesProcFactory {
      * @param ops - operators
      * @return - list of primary key containing parent ids
      */
-    private List<Integer> getPrimaryKeyCandidates(List<Operator<? extends OperatorDesc>> ops) {
-      List<Integer> result = Lists.newArrayList();
+    private Map<Integer, ColStatistics> getPrimaryKeyCandidates(List<Operator<? extends OperatorDesc>> ops) {
+      Map<Integer, ColStatistics> result = new HashMap<Integer, ColStatistics>();
       if (ops != null && !ops.isEmpty()) {
         for (int i = 0; i < ops.size(); i++) {
           Operator<? extends OperatorDesc> op = ops.get(i);
@@ -1371,7 +1404,7 @@ public class StatsRulesProcFactory {
               if (rsOp.getStatistics() != null) {
                 ColStatistics cs = rsOp.getStatistics().getColumnStatisticsFromColName(joinCol);
                 if (cs != null && cs.isPrimaryKey()) {
-                  result.add(i);
+                  result.put(i, cs);
                 }
               }
             }

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 4cd9120..ad481bc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -300,7 +300,10 @@ public class StatsUtils {
   public static void inferAndSetPrimaryKey(long numRows, List<ColStatistics> colStats) {
     if (colStats != null) {
       for (ColStatistics cs : colStats) {
-        if (cs != null && cs.getRange() != null && cs.getRange().minValue != null &&
+        if (cs != null && cs.getCountDistint() >= numRows) {
+          cs.setPrimaryKey(true);
+        }
+        else if (cs != null && cs.getRange() != null && cs.getRange().minValue != null &&
             cs.getRange().maxValue != null) {
           if (numRows ==
               ((cs.getRange().maxValue.longValue() - cs.getRange().minValue.longValue()) + 1)) {
@@ -330,6 +333,36 @@ public class StatsUtils {
     return false;
   }
 
+  /**
+   * Scale selectivity based on key range ratio.
+   * @param csPK - column statistics of primary key
+   * @param csFK - column statistics of potential foreign key
+   * @return PK range / FK range when both are positive and the FK range is narrower, else 1.0
+   */
+  public static float getScaledSelectivity(ColStatistics csPK, ColStatistics csFK) {
+    float scaledSelectivity = 1.0f;
+    if (csPK != null && csFK != null) {
+      if (csPK.isPrimaryKey()) {
+        // Use Max-Min Range as NDV gets scaled by selectivity.
+        if (csPK.getRange() != null && csFK.getRange() != null) {
+          long pkRangeDelta = getRangeDelta(csPK.getRange());
+          long fkRangeDelta = getRangeDelta(csFK.getRange());
+          if (fkRangeDelta > 0 && pkRangeDelta > 0 && fkRangeDelta < pkRangeDelta) {
+            scaledSelectivity = (float) pkRangeDelta / (float) fkRangeDelta;
+          }
+        }
+      }
+    }
+    return scaledSelectivity;
+  }
+
+  private static long getRangeDelta(ColStatistics.Range range) {
+    if (range.minValue != null && range.maxValue != null) {
+      return (range.maxValue.longValue() - range.minValue.longValue());
+    }
+    return 0;
+  }
+
   private static boolean isWithin(ColStatistics.Range range1, ColStatistics.Range range2) {
     if (range1.minValue != null && range2.minValue != null && range1.maxValue != null &&
         range2.maxValue != null) {

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/test/results/clientpositive/annotate_stats_join.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/annotate_stats_join.q.out b/ql/src/test/results/clientpositive/annotate_stats_join.q.out
index 66e944b..bc44cc3 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_join.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_join.q.out
@@ -202,10 +202,10 @@ STAGE PLANS:
             0 _col1 (type: int)
             1 _col0 (type: int)
           outputColumnNames: _col0, _col1, _col2, _col3, _col4
-          Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 48 Data size: 9312 Basic stats: COMPLETE Column stats: COMPLETE
           File Output Operator
             compressed: false
-            Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 48 Data size: 9312 Basic stats: COMPLETE Column stats: COMPLETE
             table:
                 input format: org.apache.hadoop.mapred.TextInputFormat
                 output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -497,10 +497,10 @@ STAGE PLANS:
             1 _col0 (type: int)
             2 _col1 (type: int)
           outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
-          Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 768 Data size: 225024 Basic stats: COMPLETE Column stats: COMPLETE
           File Output Operator
             compressed: false
-            Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 768 Data size: 225024 Basic stats: COMPLETE Column stats: COMPLETE
             table:
                 input format: org.apache.hadoop.mapred.TextInputFormat
                 output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out b/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
index 66e0e9f..dd70708 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out
@@ -379,14 +379,14 @@ STAGE PLANS:
             0 _col0 (type: int)
             1 _col0 (type: int)
           outputColumnNames: _col1
-          Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 136 Data size: 544 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col1 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 136 Data size: 544 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 111 Data size: 444 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 136 Data size: 544 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -448,14 +448,14 @@ STAGE PLANS:
             0 _col0 (type: int)
             1 _col0 (type: int)
           outputColumnNames: _col2
-          Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: PARTIAL
+          Statistics: Num rows: 131 Data size: 524 Basic stats: COMPLETE Column stats: PARTIAL
           Select Operator
             expressions: _col2 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: PARTIAL
+            Statistics: Num rows: 131 Data size: 524 Basic stats: COMPLETE Column stats: PARTIAL
             File Output Operator
               compressed: false
-              Statistics: Num rows: 107 Data size: 428 Basic stats: COMPLETE Column stats: PARTIAL
+              Statistics: Num rows: 131 Data size: 524 Basic stats: COMPLETE Column stats: PARTIAL
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -517,14 +517,14 @@ STAGE PLANS:
             0 _col0 (type: int)
             1 _col0 (type: int)
           outputColumnNames: _col1
-          Statistics: Num rows: 322 Data size: 1288 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 393 Data size: 1572 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col1 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 322 Data size: 1288 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 393 Data size: 1572 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 322 Data size: 1288 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 393 Data size: 1572 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -672,14 +672,14 @@ STAGE PLANS:
             1 _col0 (type: int)
             2 _col0 (type: int)
           outputColumnNames: _col1
-          Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col1 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 964 Data size: 3856 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 821 Data size: 3284 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -758,14 +758,14 @@ STAGE PLANS:
             1 _col0 (type: int)
             2 _col0 (type: int)
           outputColumnNames: _col1
-          Statistics: Num rows: 38 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 213 Data size: 852 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col1 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 38 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 213 Data size: 852 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 38 Data size: 152 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 213 Data size: 852 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -844,14 +844,14 @@ STAGE PLANS:
             1 _col0 (type: int)
             2 _col0 (type: int)
           outputColumnNames: _col1
-          Statistics: Num rows: 322 Data size: 1288 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 508 Data size: 2032 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col1 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 322 Data size: 1288 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 508 Data size: 2032 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 322 Data size: 1288 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 508 Data size: 2032 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -930,14 +930,14 @@ STAGE PLANS:
             1 _col0 (type: int)
             2 _col0 (type: int)
           outputColumnNames: _col2
-          Statistics: Num rows: 321 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 273 Data size: 1092 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col2 (type: int)
             outputColumnNames: _col0
-            Statistics: Num rows: 321 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 273 Data size: 1092 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 321 Data size: 1284 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 273 Data size: 1092 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/test/results/clientpositive/spark/annotate_stats_join.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/annotate_stats_join.q.out b/ql/src/test/results/clientpositive/spark/annotate_stats_join.q.out
index 75ad4e7..032926d 100644
--- a/ql/src/test/results/clientpositive/spark/annotate_stats_join.q.out
+++ b/ql/src/test/results/clientpositive/spark/annotate_stats_join.q.out
@@ -202,14 +202,14 @@ STAGE PLANS:
                   0 deptid (type: int)
                   1 deptid (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col6, _col7
-                Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 48 Data size: 9312 Basic stats: COMPLETE Column stats: COMPLETE
                 Select Operator
                   expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col6 (type: int), _col7 (type: string)
                   outputColumnNames: _col0, _col1, _col2, _col3, _col4
-                  Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 48 Data size: 9312 Basic stats: COMPLETE Column stats: COMPLETE
                   File Output Operator
                     compressed: false
-                    Statistics: Num rows: 41 Data size: 7954 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 48 Data size: 9312 Basic stats: COMPLETE Column stats: COMPLETE
                     table:
                         input format: org.apache.hadoop.mapred.TextInputFormat
                         output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -529,10 +529,10 @@ STAGE PLANS:
                   1 _col0 (type: int)
                   2 _col1 (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
-                Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 768 Data size: 225024 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 658 Data size: 192794 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 768 Data size: 225024 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/test/results/clientpositive/tez/vector_null_projection.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/vector_null_projection.q.out b/ql/src/test/results/clientpositive/tez/vector_null_projection.q.out
index 9b7b698..6af333d 100644
--- a/ql/src/test/results/clientpositive/tez/vector_null_projection.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_null_projection.q.out
@@ -104,30 +104,40 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: a
+                  Statistics: Num rows: 1 Data size: 87 Basic stats: COMPLETE Column stats: COMPLETE
                   Select Operator
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                     Select Operator
+                      Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                       Group By Operator
                         keys: null (type: void)
                         mode: hash
                         outputColumnNames: _col0
+                        Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                         Reduce Output Operator
                           key expressions: _col0 (type: void)
                           sort order: +
                           Map-reduce partition columns: _col0 (type: void)
+                          Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
         Map 4 
             Map Operator Tree:
                 TableScan
                   alias: b
+                  Statistics: Num rows: 1 Data size: 87 Basic stats: COMPLETE Column stats: COMPLETE
                   Select Operator
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                     Select Operator
+                      Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                       Group By Operator
                         keys: null (type: void)
                         mode: hash
                         outputColumnNames: _col0
+                        Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                         Reduce Output Operator
                           key expressions: _col0 (type: void)
                           sort order: +
                           Map-reduce partition columns: _col0 (type: void)
+                          Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
         Reducer 3 
             Reduce Operator Tree:
               Group By Operator

http://git-wip-us.apache.org/repos/asf/hive/blob/5f8b7016/ql/src/test/results/clientpositive/vector_join30.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_join30.q.out b/ql/src/test/results/clientpositive/vector_join30.q.out
index 57f9aeb..cfe047d 100644
--- a/ql/src/test/results/clientpositive/vector_join30.q.out
+++ b/ql/src/test/results/clientpositive/vector_join30.q.out
@@ -99,10 +99,12 @@ STAGE PLANS:
                 0 _col0 (type: string)
                 1 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 275 Data size: 48400 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -159,10 +161,12 @@ STAGE PLANS:
                 0 _col0 (type: string)
                 1 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 275 Data size: 48400 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -347,10 +351,12 @@ STAGE PLANS:
                 0 _col0 (type: string)
                 1 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 550 Data size: 96800 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -554,10 +560,12 @@ STAGE PLANS:
                 0 _col0 (type: string)
                 1 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 550 Data size: 96800 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -788,10 +796,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 550 Data size: 96800 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -861,10 +871,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 550 Data size: 96800 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -912,10 +924,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 550 Data size: 96800 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -1167,10 +1181,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 1100 Data size: 193600 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -1240,10 +1256,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 1100 Data size: 193600 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -1487,10 +1505,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 1100 Data size: 193600 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -1756,10 +1776,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 1100 Data size: 193600 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table:
@@ -2025,10 +2047,12 @@ STAGE PLANS:
                 1 _col0 (type: string)
                 2 _col0 (type: string)
               outputColumnNames: _col2, _col3
+              Statistics: Num rows: 1100 Data size: 193600 Basic stats: COMPLETE Column stats: NONE
               Group By Operator
                 aggregations: sum(hash(_col2,_col3))
                 mode: hash
                 outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   table: