You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rh...@apache.org on 2013/11/18 20:29:27 UTC
svn commit: r1543120 [2/16] - in /hive/trunk: common/src/java/org/apache/hadoop/hive/conf/ data/files/ ql/src/java/org/apache/hadoop/hive/ql/ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org/a...

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,1255 @@
+package org.apache.hadoop.hive.ql.stats;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantBinaryObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveCharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveVarcharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
+import org.apache.hadoop.io.BytesWritable;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+
+public class StatsUtils {
+
+  /**
+   * Collect table, partition and column level statistics for a table scan.
+   *
+   * For a non-partitioned table the row count and data size are looked up for the table as a
+   * whole; for a partitioned table they are summed over the partitions returned by
+   * {@code partList.getNotDeniedPartns()}. In both cases the data size falls back from the raw
+   * data size to the total size and finally to the size of the files on the file system. Column
+   * statistics are requested only for the columns returned by
+   * {@code tableScanOperator.getNeededColumns()}.
+   *
+   * @param conf
+   *          - hive configuration
+   * @param partList
+   *          - partition list; if this is null for a partitioned table the statistics object is
+   *          returned exactly as constructed
+   * @param table
+   *          - table
+   * @param tableScanOperator
+   *          - table scan operator
+   * @return statistics object
+   */
+  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
+      Table table, TableScanOperator tableScanOperator) {
+
+    Statistics stats = new Statistics();
+
+    // column level statistics are required only for the columns that are needed
+    List<ColumnInfo> schema = tableScanOperator.getSchema().getSignature();
+    List<String> neededColumns = tableScanOperator.getNeededColumns();
+    String dbName = table.getDbName();
+    String tabName = table.getTableName();
+
+    if (!table.isPartitioned()) {
+      long nr = getNumRows(dbName, tabName);
+      long rds = getRawDataSize(dbName, tabName);
+      // raw data size not usable: fall back to total size
+      if (rds <= 0) {
+        rds = getTotalSize(dbName, tabName);
+
+        // if data size is still 0 then get file size
+        if (rds <= 0) {
+          rds = getFileSizeForTable(conf, table);
+        }
+      }
+
+      // if basic stats are not available then return
+      // (the column stats state is left untouched on this path)
+      if (nr <= 0 && rds <= 0) {
+        stats.setBasicStatsState(Statistics.State.NONE);
+        return stats;
+      }
+
+      // if any basic stats is missing, mark it as partial stats
+      if (nr <= 0 || rds <= 0) {
+        stats.setBasicStatsState(Statistics.State.PARTIAL);
+      }
+
+      // if both are available then we have complete basic stats
+      if (nr > 0 && rds > 0) {
+        stats.setBasicStatsState(Statistics.State.COMPLETE);
+      }
+
+      // number of rows -1 means that statistics from metastore is not reliable
+      if (nr <= 0) {
+        nr = 0;
+      }
+      stats.setNumRows(nr);
+      stats.setDataSize(rds);
+
+      // one entry per needed, non-hidden column; an entry is null when no stats were found
+      List<ColStatistics> colStats = getTableColumnStats(table, schema, neededColumns);
+
+      // if column stats available and if atleast one column doesn't have stats
+      // then mark it as partial
+      if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) {
+        stats.setColumnStatsState(Statistics.State.PARTIAL);
+      }
+
+      // if column stats available and if all columns have stats then mark it
+      // as complete
+      if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) {
+        stats.setColumnStatsState(Statistics.State.COMPLETE);
+      }
+
+      if (!checkIfColStatsAvailable(colStats)) {
+        // if there is column projection and if we do not have stats then mark
+        // it as NONE. Else we will have stats for const/udf columns
+        if (!neededColumns.isEmpty()) {
+          stats.setColumnStatsState(Statistics.State.NONE);
+        } else {
+          stats.setColumnStatsState(Statistics.State.COMPLETE);
+        }
+        // NOTE(review): passes null rather than an empty list - confirm that
+        // Statistics.addToColumnStats tolerates a null argument
+        stats.addToColumnStats(null);
+      } else {
+        // set col stats and mark it as table level col stats
+        stats.addToColumnStats(colStats);
+      }
+    } else {
+
+      // For partitioned tables, get the size of all the partitions after pruning
+      // the partitions that are not required
+      if (partList != null) {
+        List<String> partNames = Lists.newArrayList();
+        for (Partition part : partList.getNotDeniedPartns()) {
+          partNames.add(part.getName());
+        }
+
+        List<Long> rowCounts = getBasicStatForPartitions(table, partNames,
+            StatsSetupConst.ROW_COUNT);
+        List<Long> dataSizes = getBasicStatForPartitions(table, partNames,
+            StatsSetupConst.RAW_DATA_SIZE);
+
+        // non-positive per-partition values are skipped when summing
+        long nr = getSumIgnoreNegatives(rowCounts);
+        long rds = getSumIgnoreNegatives(dataSizes);
+        if (rds <= 0) {
+          dataSizes = getBasicStatForPartitions(table, partNames, StatsSetupConst.TOTAL_SIZE);
+          rds = getSumIgnoreNegatives(dataSizes);
+
+          // if data size still could not be determined, then fall back to filesytem to get file
+          // sizes
+          if (rds <= 0) {
+            dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
+          }
+          // recomputed unconditionally; yields the same sum again when the file system
+          // fallback above was not taken
+          rds = getSumIgnoreNegatives(dataSizes);
+        }
+
+        // basic stats
+        if (nr <= 0 && rds <= 0) {
+          stats.updateBasicStatsState(Statistics.State.NONE);
+        } else if (nr <= 0 || rds <= 0) {
+          stats.updateBasicStatsState(Statistics.State.PARTIAL);
+        } else {
+          // both sums are positive, but an individual partition may still lack a value
+          if (containsNonPositives(rowCounts) || containsNonPositives(dataSizes)) {
+            stats.updateBasicStatsState(Statistics.State.PARTIAL);
+          } else {
+            stats.updateBasicStatsState(Statistics.State.COMPLETE);
+          }
+        }
+
+        // number of rows -1 means that statistics from metastore is not reliable
+        if (nr <= 0) {
+          nr = 0;
+        }
+        stats.addToNumRows(nr);
+        stats.addToDataSize(rds);
+
+        // column stats
+        // fetched and recorded once per partition, so the column stats state is updated
+        // once for every partition that survived pruning
+        for (Partition part : partList.getNotDeniedPartns()) {
+          List<ColStatistics> colStats = getPartitionColumnStats(table, part, schema, neededColumns);
+          if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) {
+            stats.updateColumnStatsState(Statistics.State.PARTIAL);
+          } else if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) {
+            stats.updateColumnStatsState(Statistics.State.COMPLETE);
+          } else {
+            // if there is column projection and if we do not have stats then mark
+            // it as NONE. Else we will have stats for const/udf columns
+            if (!neededColumns.isEmpty()) {
+              stats.updateColumnStatsState(Statistics.State.NONE);
+            } else {
+              stats.updateColumnStatsState(Statistics.State.COMPLETE);
+            }
+          }
+          stats.addToColumnStats(colStats);
+        }
+      }
+    }
+
+    return stats;
+
+  }
+
+  /**
+   * Determine how many bytes a table occupies on its file system.
+   *
+   * This is a best-effort lookup: any exception raised while resolving the file system or
+   * reading the content summary results in a size of 0.
+   *
+   * @param conf
+   *          - hive conf used to resolve the file system of the table path
+   * @param table
+   *          - table whose path is measured
+   * @return length reported by the content summary of the table path, or 0 on failure
+   */
+  public static long getFileSizeForTable(HiveConf conf, Table table) {
+    final Path tablePath = table.getPath();
+    try {
+      return tablePath.getFileSystem(conf).getContentSummary(tablePath).getLength();
+    } catch (Exception ignored) {
+      // size could not be determined; report it as unknown (0)
+      return 0;
+    }
+  }
+
+  /**
+   * Determine how many bytes each of the given partitions occupies on its file system.
+   *
+   * The lookup is best effort per partition: if resolving the file system or reading the
+   * content summary fails, that partition contributes a size of 0.
+   *
+   * @param conf
+   *          - hive conf used to resolve the file system of each partition path
+   * @param parts
+   *          - partition list
+   * @return sizes of the partitions, in the same order as {@code parts}
+   */
+  public static List<Long> getFileSizeForPartitions(HiveConf conf, List<Partition> parts) {
+    List<Long> partitionSizes = Lists.newArrayList();
+    for (Partition partition : parts) {
+      final Path location = partition.getPartitionPath();
+      long bytes;
+      try {
+        bytes = location.getFileSystem(conf).getContentSummary(location).getLength();
+      } catch (Exception ignored) {
+        // size could not be determined; record it as unknown (0)
+        bytes = 0;
+      }
+      partitionSizes.add(bytes);
+    }
+    return partitionSizes;
+  }
+
+  /**
+   * Check whether a list holds at least one value that is zero or negative.
+   *
+   * @param vals
+   *          - list of values
+   * @return true if any value is &lt;= 0, false otherwise (including for an empty list)
+   */
+  private static boolean containsNonPositives(List<Long> vals) {
+    for (long candidate : vals) {
+      if (candidate < 1L) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Add up the strictly positive values of a list; zero and negative entries are skipped.
+   *
+   * @param vals
+   *          - list of values
+   * @return sum of all values that are &gt; 0 (0 for an empty list)
+   */
+  public static long getSumIgnoreNegatives(List<Long> vals) {
+    long total = 0L;
+    for (long value : vals) {
+      // a non-positive value contributes nothing to the total
+      total += Math.max(value, 0L);
+    }
+    return total;
+  }
+
+  /**
+   * Fetch partition level column statistics for every needed column of the schema.
+   *
+   * Hidden virtual columns and columns whose internal name is not listed in
+   * {@code neededColumns} are skipped. Every remaining column adds exactly one entry to the
+   * result; the entry is null when no statistics could be obtained for that column.
+   *
+   * @param table
+   *          - table object
+   * @param part
+   *          - partition object
+   * @param schema
+   *          - output schema
+   * @param neededColumns
+   *          - list of needed columns
+   * @return column statistics, in schema order
+   */
+  public static List<ColStatistics> getPartitionColumnStats(Table table, Partition part,
+      List<ColumnInfo> schema, List<String> neededColumns) {
+
+    final String database = table.getDbName();
+    final String tableName = table.getTableName();
+    final String partitionName = part.getName();
+
+    List<ColStatistics> collected = Lists.newArrayList();
+    for (ColumnInfo column : schema) {
+      if (column.isHiddenVirtualCol()) {
+        continue;
+      }
+      final String internalName = column.getInternalName();
+      if (!neededColumns.contains(internalName)) {
+        continue;
+      }
+      final String alias = column.getTabAlias();
+      ColStatistics columnStats =
+          getParitionColumnStatsForColumn(database, tableName, partitionName, internalName);
+      if (columnStats != null) {
+        columnStats.setTableAlias(alias);
+      }
+      // null entries are kept on purpose so callers can detect missing stats
+      collected.add(columnStats);
+    }
+    return collected;
+  }
+
+  /**
+   * Get the partition level columns statistics from metastore for a specific column
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @param partName
+   *          - partition name
+   * @param colName
+   *          - column name
+   * @return column statistics, or null if the metastore lookup fails, returns nothing, returns
+   *         an empty list of statistics objects, or the column type is not supported
+   */
+  public static ColStatistics getParitionColumnStatsForColumn(String dbName, String tabName,
+      String partName, String colName) {
+    try {
+      ColumnStatistics colStats = Hive.get().getPartitionColumnStatistics(dbName, tabName,
+          partName, colName);
+      if (colStats == null) {
+        return null;
+      }
+
+      // guard against a missing or empty statistics list instead of failing on get(0)
+      List<ColumnStatisticsObj> statsObjs = colStats.getStatsObj();
+      if (statsObjs == null || statsObjs.isEmpty()) {
+        return null;
+      }
+
+      // a single column was requested, so only the first statistics object is used
+      return getColStatistics(statsObjs.get(0), tabName, colName);
+    } catch (HiveException e) {
+      // statistics are optional; a failed lookup is reported as "no stats"
+      return null;
+    }
+  }
+
+  /**
+   * Tell whether statistics are present for at least one column.
+   *
+   * @param colStats
+   *          - column stats, where a null entry stands for a column without statistics
+   * @return true if any entry is non-null, false if the list is empty or holds only nulls
+   */
+  private static boolean checkIfColStatsAvailable(List<ColStatistics> colStats) {
+    boolean available = false;
+    for (ColStatistics candidate : colStats) {
+      if (candidate != null) {
+        available = true;
+        break;
+      }
+    }
+    return available;
+  }
+
+  /**
+   * Get table level column stats for specified column
+   *
+   * @param dbName
+   *          - database name
+   * @param tableName
+   *          - table name
+   * @param colName
+   *          - column name
+   * @return column stats, or null if the metastore lookup fails, returns nothing, returns an
+   *         empty list of statistics objects, or the column type is not supported
+   */
+  public static ColStatistics getTableColumnStatsForColumn(String dbName, String tableName,
+      String colName) {
+    try {
+      ColumnStatistics colStat = Hive.get().getTableColumnStatistics(dbName, tableName, colName);
+      if (colStat == null) {
+        return null;
+      }
+
+      // guard against a missing or empty statistics list instead of failing on get(0)
+      List<ColumnStatisticsObj> statsObjs = colStat.getStatsObj();
+      if (statsObjs == null || statsObjs.isEmpty()) {
+        return null;
+      }
+
+      // a single column was requested, so only the first statistics object is used
+      return getColStatistics(statsObjs.get(0), tableName, colName);
+    } catch (HiveException e) {
+      // statistics are optional; a failed lookup is reported as "no stats"
+      return null;
+    }
+  }
+
+  /**
+   * Translate a metastore ColumnStatisticsObj into the planner's ColStatistics.
+   *
+   * Which fields get populated depends on the column type: integral and floating point types
+   * carry distinct count, null count and a fixed width; string-like types take their average
+   * length from the metastore; boolean columns also carry true/false counts; binary columns
+   * carry average length and null count; timestamp, decimal and date columns only get a fixed
+   * width.
+   *
+   * @param cso
+   *          - ColumnStatisticsObj
+   * @param tabName
+   *          - table name
+   * @param colName
+   *          - column name
+   * @return ColStatistics, or null when the column type is not handled (complex types)
+   */
+  public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName,
+      String colName) {
+    final String type = cso.getColType();
+    final ColStatistics converted = new ColStatistics(tabName, colName, type);
+    final ColumnStatisticsData data = cso.getStatsData();
+
+    final boolean bigInt = type.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME);
+    final boolean doublePrecision = type.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME);
+
+    if (bigInt
+        || type.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+        || type.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+        || type.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) {
+      // all integral types share the long stats; only the width differs
+      converted.setCountDistint(data.getLongStats().getNumDVs());
+      converted.setNumNulls(data.getLongStats().getNumNulls());
+      converted.setAvgColLen(
+          bigInt ? JavaDataModel.get().primitive2() : JavaDataModel.get().primitive1());
+    } else if (doublePrecision || type.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
+      // float and double share the double stats; only the width differs
+      converted.setCountDistint(data.getDoubleStats().getNumDVs());
+      converted.setNumNulls(data.getDoubleStats().getNumNulls());
+      converted.setAvgColLen(
+          doublePrecision ? JavaDataModel.get().primitive2() : JavaDataModel.get().primitive1());
+    } else if (type.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+        || type.startsWith(serdeConstants.CHAR_TYPE_NAME)
+        || type.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
+      converted.setCountDistint(data.getStringStats().getNumDVs());
+      converted.setNumNulls(data.getStringStats().getNumNulls());
+      converted.setAvgColLen(data.getStringStats().getAvgColLen());
+    } else if (type.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
+      // two distinct values only when both true and false were observed
+      final boolean bothValuesSeen = data.getBooleanStats().getNumFalses() > 0
+          && data.getBooleanStats().getNumTrues() > 0;
+      converted.setCountDistint(bothValuesSeen ? 2 : 1);
+      converted.setNumTrues(data.getBooleanStats().getNumTrues());
+      converted.setNumFalses(data.getBooleanStats().getNumFalses());
+      converted.setNumNulls(data.getBooleanStats().getNumNulls());
+      converted.setAvgColLen(JavaDataModel.get().primitive1());
+    } else if (type.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+      converted.setAvgColLen(data.getBinaryStats().getAvgColLen());
+      converted.setNumNulls(data.getBinaryStats().getNumNulls());
+    } else if (type.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+      converted.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
+    } else if (type.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+      converted.setAvgColLen(JavaDataModel.get().lengthOfDecimal());
+    } else if (type.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+      converted.setAvgColLen(JavaDataModel.get().lengthOfDate());
+    } else {
+      // Columns statistics for complex datatypes are not supported yet
+      return null;
+    }
+    return converted;
+  }
+
+  /**
+   * Fetch table level column statistics for every needed column of the schema.
+   *
+   * Hidden virtual columns and columns whose internal name is not listed in
+   * {@code neededColumns} are skipped. Every remaining column adds exactly one entry to the
+   * result; the entry is null when no statistics could be obtained for that column.
+   *
+   * @param table
+   *          - table
+   * @param schema
+   *          - output schema
+   * @param neededColumns
+   *          - list of needed columns
+   * @return column statistics, in schema order
+   */
+  public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema,
+      List<String> neededColumns) {
+
+    final String database = table.getDbName();
+    final String tableName = table.getTableName();
+
+    List<ColStatistics> collected = Lists.newArrayList();
+    for (ColumnInfo column : schema) {
+      if (column.isHiddenVirtualCol()) {
+        continue;
+      }
+      final String internalName = column.getInternalName();
+      if (!neededColumns.contains(internalName)) {
+        continue;
+      }
+      final String alias = column.getTabAlias();
+      ColStatistics columnStats = getTableColumnStatsForColumn(database, tableName, internalName);
+      if (columnStats != null) {
+        columnStats.setTableAlias(alias);
+      }
+      // null entries are kept on purpose so callers can detect missing stats
+      collected.add(columnStats);
+    }
+    return collected;
+  }
+
+  /**
+   * Get the average column length of variable length data types (string, varchar, char,
+   * binary). Any other type name is treated as a complex type and delegated to
+   * {@link #getSizeOfComplexTypes(HiveConf, ObjectInspector)}.
+   *
+   * For constant projections the length of the constant itself is returned. For non-constant
+   * string and binary values the configured
+   * {@code HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH} is returned, and for non-constant
+   * varchar/char values the maximum length reported by the object inspector.
+   *
+   * @param conf
+   *          - hive conf
+   * @param oi
+   *          - object inspector
+   * @param colType
+   *          - column type
+   * @return average column length; 0 for a null constant or when the object inspector matches
+   *         none of the handled cases
+   */
+  public static long getAvgColLenOfVariableLengthTypes(HiveConf conf, ObjectInspector oi,
+      String colType) {
+
+    long configVarLen = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH);
+
+    if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)) {
+
+      // constant string projection Ex: select "hello" from table
+      if (oi instanceof ConstantObjectInspector) {
+        ConstantObjectInspector coi = (ConstantObjectInspector) oi;
+
+        // if writable constant is null then return size 0
+        if (coi.getWritableConstantValue() == null) {
+          return 0;
+        }
+
+        // length of the constant's string form, in characters
+        return coi.getWritableConstantValue().toString().length();
+      } else if (oi instanceof WritableConstantStringObjectInspector) {
+        // NOTE(review): presumably WritableConstantStringObjectInspector is itself a
+        // ConstantObjectInspector, in which case the branch above always wins and this one is
+        // never reached - confirm. Unlike the branch above there is no null check here.
+
+        // some UDFs return writable constant strings (fixed width)
+        // Ex: select upper("hello") from table
+        WritableConstantStringObjectInspector wcsoi = (WritableConstantStringObjectInspector) oi;
+
+        return wcsoi.getWritableConstantValue().toString().length();
+      } else if (oi instanceof WritableStringObjectInspector) {
+
+        // some UDFs may emit strings of variable length. like pattern matching
+        // UDFs. it's hard to find the length of such UDFs.
+        // return the variable length from config
+        return configVarLen;
+      }
+    } else if (colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
+      // prefix match (case sensitive, unlike the equalsIgnoreCase checks in this method)
+
+      // constant varchar projection
+      if (oi instanceof ConstantObjectInspector) {
+        ConstantObjectInspector coi = (ConstantObjectInspector) oi;
+
+        // if writable constant is null then return size 0
+        if (coi.getWritableConstantValue() == null) {
+          return 0;
+        }
+
+        return coi.getWritableConstantValue().toString().length();
+      } else if (oi instanceof WritableConstantHiveVarcharObjectInspector) {
+        // NOTE(review): likely unreachable for the same reason as the constant string
+        // branch above - confirm
+
+        WritableConstantHiveVarcharObjectInspector wcsoi = (WritableConstantHiveVarcharObjectInspector) oi;
+        return wcsoi.getWritableConstantValue().toString().length();
+      } else if (oi instanceof WritableHiveVarcharObjectInspector) {
+        // non-constant varchar: use the maximum length reported by the inspector
+        return ((WritableHiveVarcharObjectInspector)oi).getMaxLength();
+      }
+    } else if (colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
+      // prefix match (case sensitive, unlike the equalsIgnoreCase checks in this method)
+
+      // constant char projection
+      if (oi instanceof ConstantObjectInspector) {
+        ConstantObjectInspector coi = (ConstantObjectInspector) oi;
+
+        // if writable constant is null then return size 0
+        if (coi.getWritableConstantValue() == null) {
+          return 0;
+        }
+
+        return coi.getWritableConstantValue().toString().length();
+      } else if (oi instanceof WritableConstantHiveCharObjectInspector) {
+        // NOTE(review): likely unreachable for the same reason as the constant string
+        // branch above - confirm
+
+        WritableConstantHiveCharObjectInspector wcsoi = (WritableConstantHiveCharObjectInspector) oi;
+        return wcsoi.getWritableConstantValue().toString().length();
+      } else if (oi instanceof WritableHiveCharObjectInspector) {
+        // non-constant char: use the maximum length reported by the inspector
+        return ((WritableHiveCharObjectInspector) oi).getMaxLength();
+      }
+    } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+
+      // constant byte arrays
+      if (oi instanceof ConstantObjectInspector) {
+        ConstantObjectInspector coi = (ConstantObjectInspector) oi;
+
+        // if writable constant is null then return size 0
+        if (coi.getWritableConstantValue() == null) {
+          return 0;
+        }
+
+        // unchecked cast: the constant of a binary column is expected to be a BytesWritable
+        BytesWritable bw = ((BytesWritable) coi.getWritableConstantValue());
+        return bw.getLength();
+      } else if (oi instanceof WritableConstantBinaryObjectInspector) {
+        // NOTE(review): likely unreachable for the same reason as the constant string
+        // branch above - confirm
+
+        // writable constant byte arrays
+        WritableConstantBinaryObjectInspector wcboi = (WritableConstantBinaryObjectInspector) oi;
+
+        return wcboi.getWritableConstantValue().getLength();
+      } else if (oi instanceof WritableBinaryObjectInspector) {
+
+        // return the variable length from config
+        return configVarLen;
+      }
+    } else {
+
+      // complex types (map, list, struct, union)
+      return getSizeOfComplexTypes(conf, oi);
+    }
+
+    // a string/varchar/char/binary column whose object inspector matched none of the
+    // instanceof checks above
+    return 0;
+  }
+
+  /**
+   * Get the size of complex data types.
+   *
+   * The object inspector is walked recursively by category: primitives are sized directly,
+   * lists, maps, structs and unions add their own overhead on top of the sizes of their
+   * element/field inspectors. Non-constant lists and maps have no known length, so the
+   * configured {@code HIVE_STATS_LIST_NUM_ENTRIES} and {@code HIVE_STATS_MAP_NUM_ENTRIES}
+   * values are used instead.
+   *
+   * @param conf
+   *          - hive conf
+   * @param oi
+   *          - object inspector
+   * @return raw data size; 0 for a category that is not handled
+   */
+  public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi) {
+    long result = 0;
+    // only used by the constant list case below
+    int length = 0;
+    int listEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_LIST_NUM_ENTRIES);
+    int mapEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_NUM_ENTRIES);
+
+    switch (oi.getCategory()) {
+    case PRIMITIVE:
+      String colType = oi.getTypeName();
+      if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+          || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+          || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
+        // string-like: size of a string holding the average number of characters
+        int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
+        result += JavaDataModel.get().lengthForStringOfLength(avgColLen);
+      } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+        // binary: size of a byte array holding the average number of bytes
+        int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
+        result += JavaDataModel.get().lengthForByteArrayOfSize(avgColLen);
+      } else {
+        result += getAvgColLenOfFixedLengthTypes(colType);
+      }
+      break;
+    case LIST:
+      if (oi instanceof StandardConstantListObjectInspector) {
+
+        // constant list projection of known length
+        StandardConstantListObjectInspector scloi = (StandardConstantListObjectInspector) oi;
+        length = scloi.getWritableConstantValue().size();
+
+        // check if list elements are primitive or Objects
+        ObjectInspector leoi = scloi.getListElementObjectInspector();
+        if (leoi.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
+          result += getSizeOfPrimitiveTypeArraysFromType(leoi.getTypeName(), length);
+        } else {
+          // non-primitive elements: only the array of references is counted here
+          result += JavaDataModel.get().lengthForObjectArrayOfSize(length);
+        }
+      } else {
+        // NOTE(review): unchecked cast - assumes every non-constant list inspector is a
+        // StandardListObjectInspector; TODO confirm against callers
+        StandardListObjectInspector sloi = (StandardListObjectInspector) oi;
+
+        // list overhead + (configured number of element in list * size of element)
+        long elemSize = getSizeOfComplexTypes(conf, sloi.getListElementObjectInspector());
+        result += JavaDataModel.get().arrayList() + (listEntries * elemSize);
+      }
+      break;
+    case MAP:
+      if (oi instanceof StandardConstantMapObjectInspector) {
+
+        // constant map projection of known length
+        StandardConstantMapObjectInspector scmoi = (StandardConstantMapObjectInspector) oi;
+        result += getSizeOfMap(scmoi);
+      } else {
+        // NOTE(review): unchecked cast - assumes every non-constant map inspector is a
+        // StandardMapObjectInspector; TODO confirm against callers
+        StandardMapObjectInspector smoi = (StandardMapObjectInspector) oi;
+        // NOTE(review): key and value sizes are added once and are not multiplied by
+        // mapEntries, unlike the list case above - confirm this is intended
+        result += getSizeOfComplexTypes(conf, smoi.getMapKeyObjectInspector());
+        result += getSizeOfComplexTypes(conf, smoi.getMapValueObjectInspector());
+
+        // hash map overhead
+        result += JavaDataModel.get().hashMap(mapEntries);
+      }
+      break;
+    case STRUCT:
+      StructObjectInspector soi = (StructObjectInspector) oi;
+
+      // add constant object overhead for struct
+      result += JavaDataModel.get().object();
+
+      // add constant struct field names references overhead
+      result += soi.getAllStructFieldRefs().size() * JavaDataModel.get().ref();
+      // plus the recursively computed size of every field
+      for (StructField field : soi.getAllStructFieldRefs()) {
+        result += getSizeOfComplexTypes(conf, field.getFieldObjectInspector());
+      }
+      break;
+    case UNION:
+      UnionObjectInspector uoi = (UnionObjectInspector) oi;
+
+      // add constant object overhead for union
+      result += JavaDataModel.get().object();
+
+      // add constant size for unions tags
+      result += uoi.getObjectInspectors().size() * JavaDataModel.get().primitive1();
+      // plus the recursively computed size of every alternative of the union
+      for (ObjectInspector foi : uoi.getObjectInspectors()) {
+        result += getSizeOfComplexTypes(conf, foi);
+      }
+      break;
+    default:
+      // any other category contributes nothing
+      break;
+    }
+
+    return result;
+  }
+
+  /**
+   * Look up the in-memory width of a fixed length primitive type.
+   *
+   * @param colType
+   *          - column type
+   * @return raw data size of one value of the type, or 0 if the type is not a known fixed
+   *         length primitive
+   */
+  public static long getAvgColLenOfFixedLengthTypes(String colType) {
+    // types that are accounted for with the width of primitive1()
+    if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
+      return JavaDataModel.get().primitive1();
+    }
+
+    // types that are accounted for with the width of primitive2()
+    if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
+      return JavaDataModel.get().primitive2();
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+      return JavaDataModel.get().lengthOfTimestamp();
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+      return JavaDataModel.get().lengthOfDate();
+    }
+
+    // decimal is matched by prefix
+    if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+      return JavaDataModel.get().lengthOfDecimal();
+    }
+
+    return 0;
+  }
+
+  /**
+   * Look up the in-memory size of an array of primitive values.
+   *
+   * @param colType
+   *          - column type of the array elements
+   * @param length
+   *          - array length
+   * @return raw data size of the array, or 0 if the element type is not handled
+   */
+  public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int length) {
+    // element types that are sized as an int array
+    if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForIntArrayOfSize(length);
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForDoubleArrayOfSize(length);
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForLongArrayOfSize(length);
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForByteArrayOfSize(length);
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForBooleanArrayOfSize(length);
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForTimestampArrayOfSize(length);
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForDateArrayOfSize(length);
+    }
+
+    // decimal is matched by prefix
+    if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+      return JavaDataModel.get().lengthForDecimalArrayOfSize(length);
+    }
+
+    return 0;
+  }
+
+  /**
+   * Estimate the size of a constant map: the writable size of every key and value plus the
+   * hash map overhead for the number of entries.
+   *
+   * @param scmoi
+   *          - object inspector of the constant map
+   * @return size of map
+   */
+  public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi) {
+    final Map<?, ?> constantMap = scmoi.getWritableConstantValue();
+    final ObjectInspector keyInspector = scmoi.getMapKeyObjectInspector();
+    final ObjectInspector valueInspector = scmoi.getMapValueObjectInspector();
+
+    long payload = 0;
+    for (Map.Entry<?, ?> pair : constantMap.entrySet()) {
+      payload += getWritableSize(keyInspector, pair.getKey())
+          + getWritableSize(valueInspector, pair.getValue());
+    }
+
+    // add additional overhead of each map entries
+    return payload + JavaDataModel.get().hashMap(constantMap.size());
+  }
+
+  /**
+   * Get size of primitive data types based on their respective writable object inspector.
+   * Variable-length types (string, binary) are sized from the length of the writable that the
+   * inspector extracts from {@code value}; every other supported type has a fixed size taken
+   * from {@link JavaDataModel}.
+   *
+   * @param oi
+   *          - object inspector
+   * @param value
+   *          - value; may be null
+   * @return raw data size, or 0 for an unsupported inspector or a null string/binary value
+   */
+  public static long getWritableSize(ObjectInspector oi, Object value) {
+    if (oi instanceof WritableStringObjectInspector) {
+      if (value == null) {
+        // a null string has no length to measure (previously this dereferenced a null writable)
+        return 0;
+      }
+      WritableStringObjectInspector woi = (WritableStringObjectInspector) oi;
+      return JavaDataModel.get().lengthForStringOfLength(
+          woi.getPrimitiveWritableObject(value).getLength());
+    } else if (oi instanceof WritableBinaryObjectInspector) {
+      if (value == null) {
+        // same guard as for strings: nothing to measure for a null binary value
+        return 0;
+      }
+      WritableBinaryObjectInspector woi = (WritableBinaryObjectInspector) oi;
+      return JavaDataModel.get().lengthForByteArrayOfSize(
+          woi.getPrimitiveWritableObject(value).getLength());
+    } else if (oi instanceof WritableBooleanObjectInspector) {
+      return JavaDataModel.get().primitive1();
+    } else if (oi instanceof WritableByteObjectInspector) {
+      return JavaDataModel.get().primitive1();
+    } else if (oi instanceof WritableDateObjectInspector) {
+      return JavaDataModel.get().lengthOfDate();
+    } else if (oi instanceof WritableDoubleObjectInspector) {
+      return JavaDataModel.get().primitive2();
+    } else if (oi instanceof WritableFloatObjectInspector) {
+      return JavaDataModel.get().primitive1();
+    } else if (oi instanceof WritableHiveDecimalObjectInspector) {
+      return JavaDataModel.get().lengthOfDecimal();
+    } else if (oi instanceof WritableIntObjectInspector) {
+      return JavaDataModel.get().primitive1();
+    } else if (oi instanceof WritableLongObjectInspector) {
+      return JavaDataModel.get().primitive2();
+    } else if (oi instanceof WritableShortObjectInspector) {
+      return JavaDataModel.get().primitive1();
+    } else if (oi instanceof WritableTimestampObjectInspector) {
+      return JavaDataModel.get().lengthOfTimestamp();
+    }
+
+    // inspector type not covered above: no size estimate
+    return 0;
+  }
+
+  /**
+   * Derive column statistics for every column of a row schema by resolving the column's
+   * expression in the column expression map and evaluating it against the parent statistics.
+   * The returned list holds one entry per schema column; an entry is null when no statistics
+   * could be derived for that column. When the expression map is null the list is empty.
+   *
+   * @param conf
+   *          - hive conf
+   * @param parentStats
+   *          - parent statistics
+   * @param colExprMap
+   *          - column expression map
+   * @param rowSchema
+   *          - row schema
+   * @return column statistics
+   */
+  public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf,
+      Statistics parentStats,
+      Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {
+
+    List<ColStatistics> stats = Lists.newArrayList();
+    if (colExprMap == null) {
+      return stats;
+    }
+
+    for (ColumnInfo column : rowSchema.getSignature()) {
+      String name = column.getInternalName();
+      String alias = column.getTabAlias();
+
+      // look up by internal name first, then retry without the KEY./VALUE. prefix
+      ExprNodeDesc expr = colExprMap.get(name);
+      if (expr == null) {
+        name = StatsUtils.stripPrefixFromColumnName(name);
+        expr = colExprMap.get(name);
+      }
+
+      ColStatistics stat = getColStatisticsFromExpression(conf, parentStats, expr);
+      if (stat != null) {
+        // statistics are always published under the prefix-free column name
+        stat.setColumnName(StatsUtils.stripPrefixFromColumnName(name));
+        stat.setTableAlias(alias);
+      }
+      stats.add(stat);
+    }
+    return stats;
+  }
+
+  /**
+   * Get column statistics expression nodes
+   *
+   * @param conf
+   *          - hive conf
+   * @param parentStats
+   *          - parent statistics
+   * @param end
+   *          - expression nodes
+   * @return column statistics
+   */
+  public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats,
+      ExprNodeDesc end) {
+
+    if (end == null) {
+      return null;
+    }
+
+    String colName = null;
+    String colType = null;
+    double avgColSize = 0;
+    long countDistincts = 0;
+    long numNulls = 0;
+    ObjectInspector oi = null;
+    long numRows = parentStats.getNumRows();
+    String tabAlias = null;
+
+    if (end instanceof ExprNodeColumnDesc) {
+      // column projection
+      ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
+      colName = encd.getColumn();
+      tabAlias = encd.getTabAlias();
+      colName = stripPrefixFromColumnName(colName);
+
+      if (encd.getIsPartitionColOrVirtualCol()) {
+
+        // vitual columns
+        colType = encd.getTypeInfo().getTypeName();
+        countDistincts = numRows;
+        oi = encd.getWritableObjectInspector();
+      } else {
+
+        // clone the column stats and return
+        ColStatistics result = parentStats.getColumnStatisticsForColumn(tabAlias, colName);
+        if (result != null) {
+          try {
+            return result.clone();
+          } catch (CloneNotSupportedException e) {
+            return null;
+          }
+        }
+        return null;
+      }
+    } else if (end instanceof ExprNodeConstantDesc) {
+
+      // constant projection
+      ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;
+
+      // null projection
+      if (encd.getValue() == null) {
+        colName = encd.getName();
+        colType = "null";
+        numNulls = numRows;
+      } else {
+        colName = encd.getName();
+        colType = encd.getTypeString();
+        countDistincts = 1;
+        oi = encd.getWritableObjectInspector();
+      }
+    } else if (end instanceof ExprNodeGenericFuncDesc) {
+
+      // udf projection
+      ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
+      colName = engfd.getName();
+      colType = engfd.getTypeString();
+      countDistincts = numRows;
+      oi = engfd.getWritableObjectInspector();
+    } else if (end instanceof ExprNodeNullDesc) {
+
+      // null projection
+      ExprNodeNullDesc ennd = (ExprNodeNullDesc) end;
+      colName = ennd.getName();
+      colType = "null";
+      numNulls = numRows;
+    }
+
+    if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+        || colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)
+        || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+        || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
+        || colType.startsWith(serdeConstants.LIST_TYPE_NAME)
+        || colType.startsWith(serdeConstants.MAP_TYPE_NAME)
+        || colType.startsWith(serdeConstants.STRUCT_TYPE_NAME)
+        || colType.startsWith(serdeConstants.UNION_TYPE_NAME)) {
+      avgColSize = getAvgColLenOfVariableLengthTypes(conf, oi, colType);
+    } else {
+      avgColSize = getAvgColLenOfFixedLengthTypes(colType);
+    }
+
+    ColStatistics colStats = new ColStatistics(tabAlias, colName, colType);
+    colStats.setAvgColLen(avgColSize);
+    colStats.setCountDistint(countDistincts);
+    colStats.setNumNulls(numNulls);
+
+    return colStats;
+  }
+
+  /**
+   * Get the number of rows of a given table, as recorded under
+   * {@link StatsSetupConst#ROW_COUNT} in the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @return number of rows; 0 when the value is unavailable
+   */
+  public static long getNumRows(String dbName, String tabName) {
+    final String statKey = StatsSetupConst.ROW_COUNT;
+    return getBasicStatForTable(dbName, tabName, statKey);
+  }
+
+  /**
+   * Get the raw data size of a given table, as recorded under
+   * {@link StatsSetupConst#RAW_DATA_SIZE} in the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @return raw data size; 0 when the value is unavailable
+   */
+  public static long getRawDataSize(String dbName, String tabName) {
+    final String statKey = StatsSetupConst.RAW_DATA_SIZE;
+    return getBasicStatForTable(dbName, tabName, statKey);
+  }
+
+  /**
+   * Get the total size of a given table, as recorded under
+   * {@link StatsSetupConst#TOTAL_SIZE} in the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @return total size; 0 when the value is unavailable
+   */
+  public static long getTotalSize(String dbName, String tabName) {
+    final String statKey = StatsSetupConst.TOTAL_SIZE;
+    return getBasicStatForTable(dbName, tabName, statKey);
+  }
+
+  /**
+   * Read one basic statistic of a table from its parameter map. Every failure mode (table
+   * lookup throws a HiveException, no parameter map, value missing or not a number) yields 0.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @param statType
+   *          - type of stats, i.e. the key looked up in the table parameters
+   * @return value of stats, or 0 when it cannot be determined
+   */
+  public static long getBasicStatForTable(String dbName, String tabName, String statType) {
+
+    Table tbl;
+    try {
+      tbl = Hive.get().getTable(dbName, tabName);
+    } catch (HiveException e) {
+      return 0;
+    }
+
+    Map<String, String> tableParams = tbl.getParameters();
+    if (tableParams == null) {
+      return 0;
+    }
+
+    try {
+      return Long.parseLong(tableParams.get(statType));
+    } catch (NumberFormatException e) {
+      // also covers a missing key: Long.parseLong rejects null with this exception
+      return 0;
+    }
+  }
+
+  /**
+   * Read one basic statistic from each of the named partitions. A partition whose value is
+   * missing or not a number contributes 0; a partition without a parameter map contributes no
+   * entry at all, so the result may be shorter than {@code partNames}. If the partitions
+   * cannot be fetched the result is empty.
+   *
+   * @param table
+   *          - table
+   * @param partNames
+   *          - partition names
+   * @param statType
+   *          - type of stats, i.e. the key looked up in the partition parameters
+   * @return value of stats, one entry per partition that has a parameter map
+   */
+  public static List<Long> getBasicStatForPartitions(Table table, List<String> partNames,
+      String statType) {
+
+    List<Long> values = Lists.newArrayList();
+    List<Partition> partitions;
+    try {
+      partitions = Hive.get().getPartitionsByNames(table, partNames);
+    } catch (HiveException e) {
+      return values;
+    }
+
+    for (Partition partition : partitions) {
+      Map<String, String> partParams = partition.getParameters();
+      if (partParams == null) {
+        continue;
+      }
+
+      long value;
+      try {
+        value = Long.parseLong(partParams.get(statType));
+      } catch (NumberFormatException e) {
+        value = 0;
+      }
+      values.add(value);
+    }
+    return values;
+  }
+
+  /**
+   * Compute raw data size from column statistics: for every non-null entry, the number of
+   * non-null rows multiplied by a per-value size that depends on the column type. Null entries
+   * in the list are skipped.
+   *
+   * @param numRows
+   *          - number of rows
+   * @param colStats
+   *          - column statistics
+   * @return raw data size; 0 when {@code numRows} is not positive
+   */
+  public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats) {
+    if (numRows <= 0) {
+      return 0;
+    }
+
+    long total = 0;
+    for (ColStatistics stat : colStats) {
+      if (stat == null) {
+        continue;
+      }
+
+      final String type = stat.getColumnType();
+      final long populated = numRows - stat.getNumNulls();
+
+      if (type.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+          || type.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+          || type.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
+          || type.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)
+          || type.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
+          || type.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
+          || type.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
+        // fixed-width primitives: the recorded average length is used as is
+        total += populated * stat.getAvgColLen();
+      } else if (type.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+          || type.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+          || type.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
+        // string-like: size of a string with the (rounded) average length
+        int avgLen = (int) Math.round(stat.getAvgColLen());
+        total += populated * JavaDataModel.get().lengthForStringOfLength(avgLen);
+      } else if (type.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+        // binary: size of a byte array with the (rounded) average length
+        int avgLen = (int) Math.round(stat.getAvgColLen());
+        total += populated * JavaDataModel.get().lengthForByteArrayOfSize(avgLen);
+      } else if (type.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+        total += populated * JavaDataModel.get().lengthOfTimestamp();
+      } else if (type.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+        total += populated * JavaDataModel.get().lengthOfDecimal();
+      } else if (type.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+        total += populated * JavaDataModel.get().lengthOfDate();
+      } else {
+        // any other type: fall back to the recorded average length
+        total += populated * stat.getAvgColLen();
+      }
+    }
+
+    return total;
+  }
+
+  /**
+   * Remove the KEY/VALUE prefix from a column name. Only names starting with "KEY._" or
+   * "VALUE._" are touched; for those, the text between the first and the second dot (or the
+   * end of the name) is returned. Every other name is returned unchanged.
+   *
+   * @param colName
+   *          - column name
+   * @return column name without KEY./VALUE. prefix
+   */
+  public static String stripPrefixFromColumnName(String colName) {
+    final boolean prefixed = colName.startsWith("KEY._") || colName.startsWith("VALUE._");
+    if (!prefixed) {
+      return colName;
+    }
+
+    // keep only the segment that directly follows the KEY./VALUE. prefix
+    final int from = colName.indexOf('.') + 1;
+    final int to = colName.indexOf('.', from);
+    return to < 0 ? colName.substring(from) : colName.substring(from, to);
+  }
+
+  /**
+   * Returns the fully qualified name of a column within a table: the non-null, non-empty
+   * parts joined with ".".
+   *
+   * @param tabName
+   *          - table name
+   * @param colName
+   *          - column name
+   * @return fully qualified column name
+   */
+  public static String getFullyQualifiedColumnName(String tabName, String colName) {
+    // no database part for this overload
+    final String[] parts = {null, tabName, colName};
+    return getFullyQualifiedName(parts);
+  }
+
+  /**
+   * Returns the fully qualified name of a column within a database table: the non-null,
+   * non-empty parts joined with ".".
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @param colName
+   *          - column name
+   * @return fully qualified column name
+   */
+  public static String getFullyQualifiedColumnName(String dbName, String tabName, String colName) {
+    final String[] parts = {dbName, tabName, colName};
+    return getFullyQualifiedName(parts);
+  }
+
+  /**
+   * Returns the fully qualified name of a column within a partition of a database table: the
+   * non-null, non-empty parts joined with ".".
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @param partName
+   *          - partition name
+   * @param colName
+   *          - column name
+   * @return fully qualified column name
+   */
+  public static String getFullyQualifiedColumnName(String dbName, String tabName, String partName,
+      String colName) {
+    final String[] parts = {dbName, tabName, partName, colName};
+    return getFullyQualifiedName(parts);
+  }
+
+  // Joins the given name parts with "." while skipping parts that are null or empty.
+  private static String getFullyQualifiedName(String... names) {
+    StringBuilder qualified = new StringBuilder();
+    for (String part : names) {
+      if (part == null || part.isEmpty()) {
+        continue;
+      }
+      // every kept part is non-empty, so a non-empty buffer means a separator is due
+      if (qualified.length() > 0) {
+        qualified.append('.');
+      }
+      qualified.append(part);
+    }
+    return qualified.toString();
+  }
+
+  /**
+   * Try to get fully qualified column names from expression nodes.
+   * <ul>
+   * <li>column expression: table alias plus the output column name found in the expression map
+   * (or the expression's own column name when the map has no match), without KEY./VALUE.
+   * prefix</li>
+   * <li>function expression: the names of its children, resolved recursively and joined with
+   * "."</li>
+   * <li>constant expression: the string form of the constant ("null" for a null constant)</li>
+   * </ul>
+   * Other expression kinds contribute no entry.
+   *
+   * @param keyExprs
+   *          - expression nodes; may be null
+   * @param map
+   *          - column expression map; may be null
+   * @return list of fully qualified names
+   */
+  public static List<String> getFullQualifedColNameFromExprs(List<ExprNodeDesc> keyExprs,
+      Map<String, ExprNodeDesc> map) {
+    List<String> result = Lists.newArrayList();
+    if (keyExprs != null) {
+      for (ExprNodeDesc end : keyExprs) {
+        String outColName = null;
+        // without an expression map there is simply no output name to look up
+        if (map != null) {
+          for (Map.Entry<String, ExprNodeDesc> entry : map.entrySet()) {
+            // when several entries match, the last one in iteration order wins
+            if (entry.getValue() != null && entry.getValue().isSame(end)) {
+              outColName = entry.getKey();
+            }
+          }
+        }
+        if (end instanceof ExprNodeColumnDesc) {
+          ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
+          if (outColName == null) {
+            outColName = encd.getColumn();
+          }
+          String tabAlias = encd.getTabAlias();
+          outColName = stripPrefixFromColumnName(outColName);
+          result.add(getFullyQualifiedColumnName(tabAlias, outColName));
+        } else if (end instanceof ExprNodeGenericFuncDesc) {
+          ExprNodeGenericFuncDesc enf = (ExprNodeGenericFuncDesc) end;
+          List<String> cols = getFullQualifedColNameFromExprs(enf.getChildren(), map);
+          String joinedStr = Joiner.on(".").skipNulls().join(cols);
+          result.add(joinedStr);
+        } else if (end instanceof ExprNodeConstantDesc) {
+          ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;
+          // a constant may hold a null value; render it as "null" instead of failing
+          Object constant = encd.getValue();
+          result.add(constant == null ? "null" : constant.toString());
+        }
+      }
+    }
+    return result;
+  }
+}

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java Mon Nov 18 19:29:24 2013
@@ -267,6 +267,15 @@ public enum JavaDataModel {
   public int lengthForBooleanArrayOfSize(int length) {
     return lengthForPrimitiveArrayOfSize(PRIMITIVE_BYTE, length);
   }
+  public int lengthForTimestampArrayOfSize(int length) { // array-size estimate for timestamps
+    return lengthForPrimitiveArrayOfSize(lengthOfTimestamp(), length); // lengthOfTimestamp() presumably acts as per-element size - TODO confirm
+  }
+  public int lengthForDateArrayOfSize(int length) { // array-size estimate for dates
+    return lengthForPrimitiveArrayOfSize(lengthOfDate(), length); // lengthOfDate() presumably acts as per-element size - TODO confirm
+  }
+  public int lengthForDecimalArrayOfSize(int length) { // array-size estimate for decimals
+    return lengthForPrimitiveArrayOfSize(lengthOfDecimal(), length); // lengthOfDecimal() presumably acts as per-element size - TODO confirm
+  }
 
   public int lengthOfDecimal() {
     // object overhead + 8 bytes for intCompact + 4 bytes for precision

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,74 @@
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+-- column stats are not COMPLETE, so stats are not updated
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc where state='OH';
+
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- state column has 5 distincts. numRows/countDistincts
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where state='OH';
+
+-- not equals comparison shouldn't affect number of rows. rawDataSize is 792 and not 796 because of rounding off issue with avgColLen. avgColLen uses integers and not double.
+-- numRows: 8 rawDataSize: 804
+explain extended select * from loc_orc where state!='OH';
+explain extended select * from loc_orc where state<>'OH';
+
+-- nulls are treated as constant equality comparison
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where zip is null;
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where !(zip is not null);
+
+-- not nulls are treated as inverse of nulls
+-- numRows: 7 rawDataSize: 702
+explain extended select * from loc_orc where zip is not null;
+-- numRows: 7 rawDataSize: 702
+explain extended select * from loc_orc where !(zip is null);
+
+-- NOT evaluation. true will pass all rows, false will not pass any rows
+-- numRows: 8 rawDataSize: 804
+explain extended select * from loc_orc where !false;
+-- numRows: 0 rawDataSize: 0
+explain extended select * from loc_orc where !true;
+
+-- OR evaluation. 1 row for OH and 1 row for CA
+-- numRows: 2 rawDataSize: 204
+explain extended select * from loc_orc where state='OH' or state='CA';
+
+-- AND evaluation. rules are applied in cascade: 8/2 = 4, then 4/2 = 2
+-- numRows: 2 rawDataSize: 204
+explain extended select * from loc_orc where year=2001 and year is null;
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where year=2001 and state='OH' and state='FL';
+
+-- AND and OR together. left expr will yield 1 row and right will yield 1 row
+-- numRows: 3 rawDataSize: 306
+explain extended select * from loc_orc where (year=2001 and year is null) or (state='CA');
+
+-- AND and OR together. left expr will yield 8 rows and right will yield 1 row
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where (year=2001 or year is null) and (state='CA');
+
+-- for all inequality conditions the rule is numRows/3
+-- numRows: 2 rawDataSize: 204
+explain extended select * from loc_orc where locid < 30;
+explain extended select * from loc_orc where locid > 30;
+explain extended select * from loc_orc where locid <= 30;
+explain extended select * from loc_orc where locid >= 30;

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,55 @@
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- only one distinct value in year column + 1 NULL value
+-- map-side GBY: numRows: 8 (map-side will not do any reduction)
+-- reduce-side GBY: numRows: 2
+explain extended select year from loc_orc group by year;
+
+-- map-side GBY: numRows: 8
+-- reduce-side GBY: numRows: 4
+explain extended select state,locid from loc_orc group by state,locid;
+
+-- map-side GBY numRows: 32 reduce-side GBY numRows: 16
+explain extended select state,locid from loc_orc group by state,locid with cube;
+
+-- map-side GBY numRows: 24 reduce-side GBY numRows: 12
+explain extended select state,locid from loc_orc group by state,locid with rollup;
+
+-- map-side GBY numRows: 8 reduce-side GBY numRows: 4
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state));
+
+-- map-side GBY numRows: 16 reduce-side GBY numRows: 8
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state),(locid));
+
+-- map-side GBY numRows: 24 reduce-side GBY numRows: 12
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state),(locid),());
+
+-- map-side GBY numRows: 32 reduce-side GBY numRows: 16
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),());
+
+set hive.stats.map.parallelism=10;
+
+-- map-side GBY: numRows: 80 (map-side will not do any reduction)
+-- reduce-side GBY: numRows: 2 Reason: numDistinct of year is 2. numRows = min(80/2, 2)
+explain extended select year from loc_orc group by year;
+
+-- map-side GBY numRows: 320 reduce-side GBY numRows: 42 Reason: numDistinct of state and locid are 6,7 resp. numRows = min(320/2, 6*7)
+explain extended select state,locid from loc_orc group by state,locid with cube;
+

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,80 @@
+create table if not exists emp_staging (
+  lastname string,
+  deptid int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists dept_staging (
+  deptid int,
+  deptname string
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists emp_orc like emp_staging;
+alter table emp_orc set fileformat orc;
+
+create table if not exists dept_orc like dept_staging;
+alter table dept_orc set fileformat orc;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging;
+LOAD DATA LOCAL INPATH '../../data/files/dept.txt' OVERWRITE INTO TABLE dept_staging;
+LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging;
+
+
+insert overwrite table emp_orc select * from emp_staging;
+insert overwrite table dept_orc select * from dept_staging;
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table emp_orc compute statistics for columns lastname,deptid;
+analyze table dept_orc compute statistics for columns deptname,deptid;
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- number of rows
+-- emp_orc  - 6
+-- dept_orc - 4
+-- loc_orc  - 8
+
+-- count distincts for relevant columns (since count distinct values are approximate, in some cases count distinct values will be greater than the number of rows)
+-- emp_orc.deptid - 3
+-- emp_orc.lastname - 7
+-- dept_orc.deptid - 6
+-- dept_orc.deptname - 5
+-- loc_orc.locid - 6
+-- loc_orc.state - 7
+
+-- Expected output rows: 4
+-- Reason: #rows = (6*4)/max(3,6)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid);
+
+-- 3 way join
+-- Expected output rows: 4
+-- Reason: #rows = (6*4*6)/max(3,6)*max(6,3)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid) join emp_orc e1 on (e.deptid = e1.deptid);
+
+-- Expected output rows: 5
+-- Reason: #rows = (6*4*8)/max(3,6)*max(6,6)
+explain extended select * from emp_orc e join dept_orc d  on (e.deptid = d.deptid) join loc_orc l on (e.deptid = l.locid);
+
+-- join keys of different types
+-- Expected output rows: 4
+-- Reason: #rows = (6*4*8)/max(3,6)*max(6,7)
+explain extended select * from emp_orc e join dept_orc d  on (e.deptid = d.deptid) join loc_orc l on (e.deptid = l.state);
+
+-- multi-attribute join
+-- Expected output rows: 0
+-- Reason: #rows = (6*4)/max(3,6)*max(7,5)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid and e.lastname = d.deptname);
+
+-- 3 way and multi-attribute join
+-- Expected output rows: 0
+-- Reason: #rows = (6*4*8)/max(3,6)*max(7,5)*max(3,6)*max(7,7)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid and e.lastname = d.deptname) join loc_orc l on (e.deptid = l.locid and e.lastname = l.state);
+

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,28 @@
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table loc_orc compute statistics for columns state, locid, zip, year;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+-- numRows: 4 rawDataSize: 396
+explain extended select * from loc_orc limit 4;
+
+-- greater than the available number of rows
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc limit 16;
+
+-- numRows: 0 rawDataSize: 0
+explain extended select * from loc_orc limit 0;

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,78 @@
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging;
+
+create table if not exists loc_orc (
+  state string,
+  locid int,
+  zip bigint
+) partitioned by(year int) stored as orc;
+
+-- basicStatState: NONE colStatState: NONE
+explain extended select * from loc_orc;
+
+set hive.stats.autogather=false;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+
+insert overwrite table loc_orc partition(year) select * from loc_staging;
+
+-- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from loc_orc;
+
+-- partition level analyze statistics for a specific partition
+analyze table loc_orc partition(year=2001) compute statistics;
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__';
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc where year=2001;
+
+-- partition level analyze statistics for all partitions
+analyze table loc_orc partition(year) compute statistics;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__';
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc where year=2001 or year='__HIVE_DEFAULT_PARTITION__';
+
+-- both partitions will be pruned
+-- basicStatState: NONE colStatState: NONE
+explain extended select * from loc_orc where year=2001 and year='__HIVE_DEFAULT_PARTITION__';
+
+-- partition level partial column statistics
+analyze table loc_orc partition(year=2001) compute statistics for columns state,locid;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select zip from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select state from loc_orc;
+
+-- column statistics for __HIVE_DEFAULT_PARTITION__ is not supported yet. Hence colStatState reports PARTIAL
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select state,locid from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select state,locid from loc_orc where year=2001;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select state,locid from loc_orc where year!=2001;
+
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select * from loc_orc;

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,140 @@
+create table if not exists alltypes (
+ bo1 boolean,
+ ti1 tinyint,
+ si1 smallint,
+ i1 int,
+ bi1 bigint,
+ f1 float,
+ d1 double,
+ de1 decimal,
+ ts1 timestamp,
+ da1 timestamp,
+ s1 string,
+ m1 map<string, string>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:string>
+) row format delimited fields terminated by '|'
+collection items terminated by ','
+map keys terminated by ':' stored as textfile;
+
+create table alltypes_orc like alltypes;
+alter table alltypes_orc set fileformat orc;
+
+load data local inpath '../../data/files/alltypes.txt' overwrite into table alltypes;
+
+insert overwrite table alltypes_orc select * from alltypes;
+
+-- basicStatState: COMPLETE colStatState: NONE numRows: 2 rawDataSize: 1514
+explain extended select * from alltypes_orc;
+
+-- statistics for complex types are not supported yet
+analyze table alltypes_orc compute statistics for columns bo1, ti1, si1, i1, bi1, f1, d1,s1;
+
+-- numRows: 2 rawDataSize: 1514
+explain extended select * from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 8
+explain extended select bo1 from alltypes_orc;
+
+-- col alias renaming
+-- numRows: 2 rawDataSize: 8
+explain extended select i1 as int1 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 174
+explain extended select s1 from alltypes_orc;
+
+-- column statistics for complex types unsupported and so statistics will not be updated
+-- numRows: 2 rawDataSize: 1514
+explain extended select m1 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 246
+explain extended select bo1, ti1, si1, i1, bi1, f1, d1,s1 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 0
+explain extended select null from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 8
+explain extended select 11 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 16
+explain extended select 11L from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 16
+explain extended select 11.0 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 178
+explain extended select "hello" from alltypes_orc;
+explain extended select cast("hello" as char(5)) from alltypes_orc;
+explain extended select cast("hello" as varchar(5)) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 96
+explain extended select unbase64("0xe23") from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 16
+explain extended select cast("1" as TINYINT), cast("20" as SMALLINT) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 80
+explain extended select cast("1970-12-31 15:59:58.174" as TIMESTAMP) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 112
+explain extended select cast("1970-12-31 15:59:58.174" as DATE) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 224
+explain extended select cast("58.174" as DECIMAL) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 112
+explain extended select array(1,2,3) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 1508
+explain extended select str_to_map("a=1 b=2 c=3", " ", "=") from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 112
+explain extended select NAMED_STRUCT("a", 11, "b", 11) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 250
+explain extended select CREATE_UNION(0, "hello") from alltypes_orc;
+
+-- COUNT(*) is projected as new column. It is not projected as GenericUDF and so datasize estimate will be based on number of rows
+-- numRows: 1 rawDataSize: 8
+explain extended select count(*) from alltypes_orc;
+
+-- COUNT(1) is projected as new column. It is not projected as GenericUDF and so datasize estimate will be based on number of rows
+-- numRows: 1 rawDataSize: 8
+explain extended select count(1) from alltypes_orc;
+
+-- column statistics for complex column types will be missing. data size will be calculated from available column statistics
+-- numRows: 2 rawDataSize: 254
+explain extended select *,11 from alltypes_orc;
+
+-- subquery selects
+-- inner select - numRows: 2 rawDataSize: 8
+-- outer select - numRows: 2 rawDataSize: 8
+explain extended select i1 from (select i1 from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 16
+-- outer select - numRows: 2 rawDataSize: 8
+explain extended select i1 from (select i1,11 from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 16
+-- outer select - numRows: 2 rawDataSize: 186
+explain extended select i1,"hello" from (select i1,11 from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 24
+-- outer select - numRows: 2 rawDataSize: 16
+explain extended select x from (select i1,11.0 as x from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 104
+-- outer select - numRows: 2 rawDataSize: 186
+explain extended select x,"hello" from (select i1 as x, unbase64("0xe23") as ub from alltypes_orc limit 10) temp;
+
+-- inner select -  numRows: 2 rawDataSize: 186
+-- middle select - numRows: 2 rawDataSize: 178
+-- outer select -  numRows: 2 rawDataSize: 194
+explain extended select h, 11.0 from (select hell as h from (select i1, "hello" as hell from alltypes_orc limit 10) in1 limit 10) in2;
+
+-- This test is for FILTER operator where filter expression is a boolean column
+-- numRows: 2 rawDataSize: 8
+explain extended select bo1 from alltypes_orc where bo1;
+
+-- numRows: 0 rawDataSize: 0
+explain extended select bo1 from alltypes_orc where !bo1;

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,52 @@
+create table if not exists emp_staging (
+  lastname string,
+  deptid int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists emp_orc like emp_staging;
+alter table emp_orc set fileformat orc;
+
+-- basicStatState: NONE colStatState: NONE
+explain extended select * from emp_orc;
+
+LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging;
+
+set hive.stats.autogather=false;
+
+insert overwrite table emp_orc select * from emp_staging;
+
+-- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from emp_orc;
+
+-- table level analyze statistics
+analyze table emp_orc compute statistics;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from emp_orc;
+
+-- column level partial statistics
+analyze table emp_orc compute statistics for columns deptid;
+
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select * from emp_orc;
+
+-- all selected columns have statistics
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select deptid from emp_orc;
+
+-- column level complete statistics
+analyze table emp_orc compute statistics for columns lastname,deptid;
+
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select * from emp_orc;
+
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select lastname from emp_orc;
+
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select deptid from emp_orc;
+
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select lastname,deptid from emp_orc;

Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,53 @@
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- numRows: 8 rawDataSize: 688
+explain extended select state from loc_orc;
+
+-- numRows: 16 rawDataSize: 1376
+explain extended select * from (select state from loc_orc union all select state from loc_orc) tmp;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+-- numRows: 16 rawDataSize: 1592
+explain extended select * from (select * from loc_orc union all select * from loc_orc) tmp;
+
+create database test;
+use test;
+create table if not exists loc_staging (
+  state string,
+  locid int,
+  zip bigint,
+  year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table loc_staging compute statistics;
+analyze table loc_staging compute statistics for columns state,locid,zip,year;
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- numRows: 16 rawDataSize: 1376
+explain extended select * from (select state from default.loc_orc union all select state from test.loc_orc) temp;
+
+-- numRows: 16 rawDataSize: 1376
+explain extended select * from (select state from test.loc_staging union all select state from test.loc_orc) temp;

Modified: hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out Mon Nov 18 19:29:24 2013
@@ -112,16 +112,24 @@ STAGE PLANS:
         alter_coltype 
           TableScan
             alias: alter_coltype
+            Statistics:
+                numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
             GatherStats: false
             Select Operator
+              Statistics:
+                  numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
               Group By Operator
                 aggregations:
                       expr: count()
                 bucketGroup: false
                 mode: hash
                 outputColumnNames: _col0
+                Statistics:
+                    numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                 Reduce Output Operator
                   sort order: 
+                  Statistics:
+                      numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                   tag: -1
                   value expressions:
                         expr: _col0
@@ -181,16 +189,22 @@ STAGE PLANS:
           bucketGroup: false
           mode: mergepartial
           outputColumnNames: _col0
+          Statistics:
+              numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
           Select Operator
             expressions:
                   expr: _col0
                   type: bigint
             outputColumnNames: _col0
+            Statistics:
+                numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
             File Output Operator
               compressed: false
               GlobalTableId: 0
 #### A masked pattern was here ####
               NumFilesPerFileSink: 1
+              Statistics:
+                  numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
 #### A masked pattern was here ####
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
@@ -308,16 +322,24 @@ STAGE PLANS:
         alter_coltype 
           TableScan
             alias: alter_coltype
+            Statistics:
+                numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
             GatherStats: false
             Select Operator
+              Statistics:
+                  numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
               Group By Operator
                 aggregations:
                       expr: count()
                 bucketGroup: false
                 mode: hash
                 outputColumnNames: _col0
+                Statistics:
+                    numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                 Reduce Output Operator
                   sort order: 
+                  Statistics:
+                      numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                   tag: -1
                   value expressions:
                         expr: _col0
@@ -377,16 +399,22 @@ STAGE PLANS:
           bucketGroup: false
           mode: mergepartial
           outputColumnNames: _col0
+          Statistics:
+              numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
           Select Operator
             expressions:
                   expr: _col0
                   type: bigint
             outputColumnNames: _col0
+            Statistics:
+                numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
             File Output Operator
               compressed: false
               GlobalTableId: 0
 #### A masked pattern was here ####
               NumFilesPerFileSink: 1
+              Statistics:
+                  numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
 #### A masked pattern was here ####
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
@@ -457,21 +485,31 @@ STAGE PLANS:
         alter_coltype 
           TableScan
             alias: alter_coltype
+            Statistics:
+                numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: COMPLETE
             GatherStats: false
             Filter Operator
               isSamplingPred: false
               predicate:
                   expr: ((ts = 3.0) and (dt = 10))
                   type: boolean
+              Statistics:
+                  numRows: 75 dataSize: 0 basicStatsState: COMPLETE colStatsState: COMPLETE
               Select Operator
+                Statistics:
+                    numRows: 75 dataSize: 0 basicStatsState: COMPLETE colStatsState: COMPLETE
                 Group By Operator
                   aggregations:
                         expr: count()
                   bucketGroup: false
                   mode: hash
                   outputColumnNames: _col0
+                  Statistics:
+                      numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                   Reduce Output Operator
                     sort order: 
+                    Statistics:
+                        numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                     tag: -1
                     value expressions:
                           expr: _col0
@@ -617,16 +655,22 @@ STAGE PLANS:
           bucketGroup: false
           mode: mergepartial
           outputColumnNames: _col0
+          Statistics:
+              numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
           Select Operator
             expressions:
                   expr: _col0
                   type: bigint
             outputColumnNames: _col0
+            Statistics:
+                numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
             File Output Operator
               compressed: false
               GlobalTableId: 0
 #### A masked pattern was here ####
               NumFilesPerFileSink: 1
+              Statistics:
+                  numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
 #### A masked pattern was here ####
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
@@ -769,6 +813,8 @@ STAGE PLANS:
         alter_coltype 
           TableScan
             alias: alter_coltype
+            Statistics:
+                numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: NONE
             GatherStats: false
             Select Operator
               expressions:
@@ -781,11 +827,15 @@ STAGE PLANS:
                     expr: ts
                     type: string
               outputColumnNames: _col0, _col1, _col2, _col3
+              Statistics:
+                  numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: NONE
               File Output Operator
                 compressed: false
                 GlobalTableId: 0
 #### A masked pattern was here ####
                 NumFilesPerFileSink: 1
+                Statistics:
+                    numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: NONE
 #### A masked pattern was here ####
                 table:
                     input format: org.apache.hadoop.mapred.TextInputFormat
@@ -1008,16 +1058,24 @@ STAGE PLANS:
         alter_coltype 
           TableScan
             alias: alter_coltype
+            Statistics:
+                numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: COMPLETE
             GatherStats: false
             Select Operator
+              Statistics:
+                  numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: COMPLETE
               Group By Operator
                 aggregations:
                       expr: count()
                 bucketGroup: false
                 mode: hash
                 outputColumnNames: _col0
+                Statistics:
+                    numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                 Reduce Output Operator
                   sort order: 
+                  Statistics:
+                      numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
                   tag: -1
                   value expressions:
                         expr: _col0
@@ -1163,16 +1221,22 @@ STAGE PLANS:
           bucketGroup: false
           mode: mergepartial
           outputColumnNames: _col0
+          Statistics:
+              numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
           Select Operator
             expressions:
                   expr: _col0
                   type: bigint
             outputColumnNames: _col0
+            Statistics:
+                numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
             File Output Operator
               compressed: false
               GlobalTableId: 0
 #### A masked pattern was here ####
               NumFilesPerFileSink: 1
+              Statistics:
+                  numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
 #### A masked pattern was here ####
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat