You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rh...@apache.org on 2013/11/18 20:29:27 UTC
svn commit: r1543120 [2/16] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/ data/files/
ql/src/java/org/apache/hadoop/hive/ql/
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org/a...
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,1255 @@
+package org.apache.hadoop.hive.ql.stats;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantBinaryObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveCharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveVarcharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
+import org.apache.hadoop.io.BytesWritable;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+
+public class StatsUtils {
+
+  /**
+   * Collects basic (row count, data size) and column level statistics for the table read by a
+   * table scan operator.
+   * <p>
+   * Data size is taken from the raw data size statistic, then from the total size statistic and
+   * finally from the file system, using the first source that yields a positive value. For a
+   * partitioned table the values of all not-denied partitions of the pruned partition list are
+   * summed up; if the partition list is null an empty statistics object is returned.
+   *
+   * @param conf
+   *          - hive configuration
+   * @param partList
+   *          - pruned partition list (only consulted for partitioned tables)
+   * @param table
+   *          - table
+   * @param tableScanOperator
+   *          - table scan operator supplying the output schema and the needed columns
+   * @return statistics object
+   */
+  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
+      Table table, TableScanOperator tableScanOperator) {
+
+    Statistics stats = new Statistics();
+
+    // column level statistics are fetched only for the columns the scan actually needs
+    List<ColumnInfo> schema = tableScanOperator.getSchema().getSignature();
+    List<String> neededColumns = tableScanOperator.getNeededColumns();
+    String dbName = table.getDbName();
+    String tabName = table.getTableName();
+
+    if (table.isPartitioned()) {
+
+      // without a pruned partition list there is nothing to aggregate
+      if (partList == null) {
+        return stats;
+      }
+
+      List<String> partNames = Lists.newArrayList();
+      for (Partition part : partList.getNotDeniedPartns()) {
+        partNames.add(part.getName());
+      }
+
+      List<Long> rowCounts = getBasicStatForPartitions(table, partNames,
+          StatsSetupConst.ROW_COUNT);
+      List<Long> dataSizes = getBasicStatForPartitions(table, partNames,
+          StatsSetupConst.RAW_DATA_SIZE);
+
+      long numRows = getSumIgnoreNegatives(rowCounts);
+      long dataSize = getSumIgnoreNegatives(dataSizes);
+      if (dataSize <= 0) {
+        // raw data size unavailable: try total size next
+        dataSizes = getBasicStatForPartitions(table, partNames, StatsSetupConst.TOTAL_SIZE);
+        dataSize = getSumIgnoreNegatives(dataSizes);
+
+        if (dataSize <= 0) {
+          // last resort: ask the file system for the size of each partition directory
+          dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
+          dataSize = getSumIgnoreNegatives(dataSizes);
+        }
+      }
+
+      // basic stats are complete only if every partition contributed both values
+      Statistics.State basicState;
+      if (numRows <= 0 && dataSize <= 0) {
+        basicState = Statistics.State.NONE;
+      } else if (numRows <= 0 || dataSize <= 0 || containsNonPositives(rowCounts)
+          || containsNonPositives(dataSizes)) {
+        basicState = Statistics.State.PARTIAL;
+      } else {
+        basicState = Statistics.State.COMPLETE;
+      }
+      stats.updateBasicStatsState(basicState);
+
+      // a non-positive row count means the metastore value is not reliable; report 0 instead
+      stats.addToNumRows(Math.max(numRows, 0));
+      stats.addToDataSize(dataSize);
+
+      // column stats, one lookup per partition
+      for (Partition part : partList.getNotDeniedPartns()) {
+        List<ColStatistics> partColStats = getPartitionColumnStats(table, part, schema,
+            neededColumns);
+        Statistics.State colState;
+        if (checkIfColStatsAvailable(partColStats)) {
+          // at least one column has stats; a null entry marks a column without stats
+          colState = partColStats.contains(null) ? Statistics.State.PARTIAL
+              : Statistics.State.COMPLETE;
+        } else {
+          // with a column projection and no stats the state is NONE; without a projection
+          // only const/udf columns remain, for which stats will be available
+          colState = neededColumns.isEmpty() ? Statistics.State.COMPLETE
+              : Statistics.State.NONE;
+        }
+        stats.updateColumnStatsState(colState);
+        stats.addToColumnStats(partColStats);
+      }
+
+      return stats;
+    }
+
+    // unpartitioned table
+    long numRows = getNumRows(dbName, tabName);
+    long dataSize = getRawDataSize(dbName, tabName);
+    if (dataSize <= 0) {
+      dataSize = getTotalSize(dbName, tabName);
+
+      // if the metastore knows no size at all, fall back to the size on the file system
+      if (dataSize <= 0) {
+        dataSize = getFileSizeForTable(conf, table);
+      }
+    }
+
+    // no basic stats at all: nothing more to collect
+    if (numRows <= 0 && dataSize <= 0) {
+      stats.setBasicStatsState(Statistics.State.NONE);
+      return stats;
+    }
+
+    // at this point at least one value is positive; both positive means complete
+    boolean basicComplete = numRows > 0 && dataSize > 0;
+    stats.setBasicStatsState(basicComplete ? Statistics.State.COMPLETE
+        : Statistics.State.PARTIAL);
+
+    // a non-positive row count means the metastore value is not reliable; report 0 instead
+    stats.setNumRows(Math.max(numRows, 0));
+    stats.setDataSize(dataSize);
+
+    List<ColStatistics> colStats = getTableColumnStats(table, schema, neededColumns);
+    if (checkIfColStatsAvailable(colStats)) {
+      // partial if at least one needed column has no stats, complete otherwise
+      stats.setColumnStatsState(colStats.contains(null) ? Statistics.State.PARTIAL
+          : Statistics.State.COMPLETE);
+      stats.addToColumnStats(colStats);
+    } else {
+      // with a column projection and no stats the state is NONE; without a projection
+      // only const/udf columns remain, for which stats will be available
+      stats.setColumnStatsState(neededColumns.isEmpty() ? Statistics.State.COMPLETE
+          : Statistics.State.NONE);
+      stats.addToColumnStats(null);
+    }
+
+    return stats;
+
+  }
+
+  /**
+   * Find the number of bytes occupied on the file system by a table's location.
+   *
+   * @param conf
+   *          - hive conf
+   * @param table
+   *          - table
+   * @return size on disk, or 0 if it cannot be determined
+   */
+  public static long getFileSizeForTable(HiveConf conf, Table table) {
+    Path tablePath = table.getPath();
+    try {
+      return tablePath.getFileSystem(conf).getContentSummary(tablePath).getLength();
+    } catch (Exception e) {
+      // best effort: any failure while resolving the size is reported as "unknown" (0)
+      return 0;
+    }
+  }
+
+  /**
+   * Find the number of bytes occupied on the file system by each partition of a list.
+   *
+   * @param conf
+   *          - hive conf
+   * @param parts
+   *          - partition list
+   * @return sizes of partitions, in the order of the input list; a partition whose size cannot
+   *         be determined contributes 0
+   */
+  public static List<Long> getFileSizeForPartitions(HiveConf conf, List<Partition> parts) {
+    List<Long> partitionSizes = Lists.newArrayList();
+    for (Partition part : parts) {
+      Path partPath = part.getPartitionPath();
+      long bytes;
+      try {
+        bytes = partPath.getFileSystem(conf).getContentSummary(partPath).getLength();
+      } catch (Exception e) {
+        // best effort: any failure while resolving the size is reported as "unknown" (0)
+        bytes = 0;
+      }
+      partitionSizes.add(bytes);
+    }
+    return partitionSizes;
+  }
+
+  /**
+   * Checks whether a list holds at least one value that is zero or negative.
+   *
+   * @param vals
+   *          - list of values
+   * @return true if any value is &lt;= 0, false otherwise (also for an empty list)
+   */
+  private static boolean containsNonPositives(List<Long> vals) {
+    for (Long candidate : vals) {
+      if (candidate.longValue() > 0L) {
+        continue;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Get the sum of all strictly positive values in the list; zero and negative entries are
+   * skipped.
+   *
+   * @param vals
+   *          - list of values
+   * @return sum of the positive values, 0 for an empty list
+   */
+  public static long getSumIgnoreNegatives(List<Long> vals) {
+    long total = 0;
+    for (Long value : vals) {
+      // non-positive entries contribute nothing to the sum
+      total += Math.max(value.longValue(), 0L);
+    }
+    return total;
+  }
+
+  /**
+   * Get the partition level column statistics from metastore for all the needed columns.
+   * Hidden virtual columns and columns that are not needed are skipped. A needed column without
+   * statistics is represented by a null entry in the result.
+   *
+   * @param table
+   *          - table object
+   * @param part
+   *          - partition object
+   * @param schema
+   *          - output schema
+   * @param neededColumns
+   *          - list of needed columns
+   * @return column statistics, one entry per needed visible column of the schema
+   */
+  public static List<ColStatistics> getPartitionColumnStats(Table table, Partition part,
+      List<ColumnInfo> schema, List<String> neededColumns) {
+
+    String dbName = table.getDbName();
+    String tabName = table.getTableName();
+    String partName = part.getName();
+    List<ColStatistics> result = Lists.newArrayList();
+    for (ColumnInfo col : schema) {
+      if (col.isHiddenVirtualCol()) {
+        continue;
+      }
+      String colName = col.getInternalName();
+      if (!neededColumns.contains(colName)) {
+        continue;
+      }
+      String tabAlias = col.getTabAlias();
+      ColStatistics cs = getParitionColumnStatsForColumn(dbName, tabName, partName, colName);
+      if (cs != null) {
+        cs.setTableAlias(tabAlias);
+      }
+      // null is added on purpose: callers use it to detect columns without stats
+      result.add(cs);
+    }
+    return result;
+  }
+
+  /**
+   * Get the partition level column statistics from metastore for a specific column.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @param partName
+   *          - partition name
+   * @param colName
+   *          - column name
+   * @return column statistics, or null if the metastore lookup fails, returns nothing, returns
+   *         no statistics object, or the column type is not supported
+   */
+  public static ColStatistics getParitionColumnStatsForColumn(String dbName, String tabName,
+      String partName, String colName) {
+    try {
+      ColumnStatistics colStats = Hive.get().getPartitionColumnStatistics(dbName, tabName,
+          partName, colName);
+      if (colStats == null) {
+        return null;
+      }
+
+      // guard against an answer without any statistics object; get(0) on a null or empty
+      // list would otherwise escape as an unchecked exception
+      List<ColumnStatisticsObj> statsObjs = colStats.getStatsObj();
+      if (statsObjs == null || statsObjs.isEmpty()) {
+        return null;
+      }
+
+      // a single column was requested, so the first object is the one of interest
+      return getColStatistics(statsObjs.get(0), tabName, colName);
+    } catch (HiveException e) {
+      // statistics are optional: a failed lookup is treated as "no stats available"
+      return null;
+    }
+  }
+
+  /**
+   * Will return true if column statistics for at least one column is available, i.e. if the
+   * list has at least one non-null entry.
+   *
+   * @param colStats
+   *          - column stats
+   * @return true if any entry is non-null, false otherwise (also for an empty list)
+   */
+  private static boolean checkIfColStatsAvailable(List<ColStatistics> colStats) {
+    boolean available = false;
+    for (ColStatistics entry : colStats) {
+      if (entry != null) {
+        available = true;
+        break;
+      }
+    }
+    return available;
+  }
+
+  /**
+   * Get table level column stats for specified column.
+   *
+   * @param dbName
+   *          - database name
+   * @param tableName
+   *          - table name
+   * @param colName
+   *          - column name
+   * @return column stats, or null if the metastore lookup fails, returns nothing, returns no
+   *         statistics object, or the column type is not supported
+   */
+  public static ColStatistics getTableColumnStatsForColumn(String dbName, String tableName,
+      String colName) {
+    try {
+      ColumnStatistics colStat = Hive.get().getTableColumnStatistics(dbName, tableName, colName);
+      if (colStat == null) {
+        return null;
+      }
+
+      // guard against an answer without any statistics object; get(0) on a null or empty
+      // list would otherwise escape as an unchecked exception
+      List<ColumnStatisticsObj> statsObjs = colStat.getStatsObj();
+      if (statsObjs == null || statsObjs.isEmpty()) {
+        return null;
+      }
+
+      // a single column was requested, so the first object is the one of interest
+      return getColStatistics(statsObjs.get(0), tableName, colName);
+    } catch (HiveException e) {
+      // statistics are optional: a failed lookup is treated as "no stats available"
+      return null;
+    }
+  }
+
+ /**
+ * Convert ColumnStatisticsObj to ColStatistics
+ *
+ * @param cso
+ * - ColumnStatisticsObj
+ * @param tabName
+ * - table name
+ * @param colName
+ * - column name
+ * @return ColStatistics
+ */
+ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName,
+ String colName) {
+ ColStatistics cs = new ColStatistics(tabName, colName, cso.getColType());
+ String colType = cso.getColType();
+ ColumnStatisticsData csd = cso.getStatsData();
+ if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) {
+ cs.setCountDistint(csd.getLongStats().getNumDVs());
+ cs.setNumNulls(csd.getLongStats().getNumNulls());
+ cs.setAvgColLen(JavaDataModel.get().primitive1());
+ } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
+ cs.setCountDistint(csd.getLongStats().getNumDVs());
+ cs.setNumNulls(csd.getLongStats().getNumNulls());
+ cs.setAvgColLen(JavaDataModel.get().primitive2());
+ } else if (colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
+ cs.setCountDistint(csd.getDoubleStats().getNumDVs());
+ cs.setNumNulls(csd.getDoubleStats().getNumNulls());
+ cs.setAvgColLen(JavaDataModel.get().primitive1());
+ } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
+ cs.setCountDistint(csd.getDoubleStats().getNumDVs());
+ cs.setNumNulls(csd.getDoubleStats().getNumNulls());
+ cs.setAvgColLen(JavaDataModel.get().primitive2());
+ } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+ || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
+ || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
+ cs.setCountDistint(csd.getStringStats().getNumDVs());
+ cs.setNumNulls(csd.getStringStats().getNumNulls());
+ cs.setAvgColLen(csd.getStringStats().getAvgColLen());
+ } else if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
+ if (csd.getBooleanStats().getNumFalses() > 0 && csd.getBooleanStats().getNumTrues() > 0) {
+ cs.setCountDistint(2);
+ } else {
+ cs.setCountDistint(1);
+ }
+ cs.setNumTrues(csd.getBooleanStats().getNumTrues());
+ cs.setNumFalses(csd.getBooleanStats().getNumFalses());
+ cs.setNumNulls(csd.getBooleanStats().getNumNulls());
+ cs.setAvgColLen(JavaDataModel.get().primitive1());
+ } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+ cs.setAvgColLen(csd.getBinaryStats().getAvgColLen());
+ cs.setNumNulls(csd.getBinaryStats().getNumNulls());
+ } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+ cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
+ } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+ cs.setAvgColLen(JavaDataModel.get().lengthOfDecimal());
+ } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+ cs.setAvgColLen(JavaDataModel.get().lengthOfDate());
+ } else {
+ // Columns statistics for complex datatypes are not supported yet
+ return null;
+ }
+ return cs;
+ }
+
+  /**
+   * Get table level column statistics from metastore for needed columns. Hidden virtual
+   * columns and columns that are not needed are skipped. A needed column without statistics
+   * is represented by a null entry in the result.
+   *
+   * @param table
+   *          - table
+   * @param schema
+   *          - output schema
+   * @param neededColumns
+   *          - list of needed columns
+   * @return column statistics, one entry per needed visible column of the schema
+   */
+  public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema,
+      List<String> neededColumns) {
+
+    String dbName = table.getDbName();
+    String tabName = table.getTableName();
+    List<ColStatistics> result = Lists.newArrayList();
+    for (ColumnInfo col : schema) {
+      if (col.isHiddenVirtualCol()) {
+        continue;
+      }
+      String colName = col.getInternalName();
+      if (!neededColumns.contains(colName)) {
+        continue;
+      }
+      String tabAlias = col.getTabAlias();
+      ColStatistics cs = getTableColumnStatsForColumn(dbName, tabName, colName);
+      if (cs != null) {
+        cs.setTableAlias(tabAlias);
+      }
+      // null is added on purpose: callers use it to detect columns without stats
+      result.add(cs);
+    }
+    return result;
+  }
+
+  /**
+   * Get the average column length of variable length data types (string, varchar, char,
+   * binary). Any other column type is delegated to {@link #getSizeOfComplexTypes}.
+   * <p>
+   * Constants yield their actual length (0 for a null constant), varchar/char inspectors yield
+   * their maximum length, and plain string/binary inspectors yield the configured maximum
+   * variable length. Any other inspector yields 0.
+   *
+   * @param conf
+   *          - hive conf
+   * @param oi
+   *          - object inspector
+   * @param colType
+   *          - column type
+   * @return average column length
+   */
+  public static long getAvgColLenOfVariableLengthTypes(HiveConf conf, ObjectInspector oi,
+      String colType) {
+
+    long configVarLen = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH);
+
+    // classify the column type once; precedence is string, varchar, char, binary
+    boolean isString = colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME);
+    boolean isVarchar = !isString && colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME);
+    boolean isChar = !isString && !isVarchar
+        && colType.startsWith(serdeConstants.CHAR_TYPE_NAME);
+    boolean isBinary = !isString && !isVarchar && !isChar
+        && colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME);
+
+    if (!isString && !isVarchar && !isChar && !isBinary) {
+
+      // complex types (map, list, struct, union)
+      return getSizeOfComplexTypes(conf, oi);
+    }
+
+    // constant projection, Ex: select "hello" from table
+    if (oi instanceof ConstantObjectInspector) {
+      Object constant = ((ConstantObjectInspector) oi).getWritableConstantValue();
+
+      // if writable constant is null then return size 0
+      if (constant == null) {
+        return 0;
+      }
+      if (isBinary) {
+        return ((BytesWritable) constant).getLength();
+      }
+      return constant.toString().length();
+    }
+
+    // NOTE(review): the writable-constant branches below are only reached if those inspectors
+    // are not ConstantObjectInspector instances - confirm whether they are still needed
+    if (isString) {
+      if (oi instanceof WritableConstantStringObjectInspector) {
+
+        // some UDFs return writable constant strings (fixed width)
+        // Ex: select upper("hello") from table
+        return ((WritableConstantStringObjectInspector) oi).getWritableConstantValue()
+            .toString().length();
+      }
+      if (oi instanceof WritableStringObjectInspector) {
+
+        // some UDFs (like pattern matching UDFs) emit strings of variable length whose size
+        // is hard to determine, so use the variable length from config
+        return configVarLen;
+      }
+    } else if (isVarchar) {
+      if (oi instanceof WritableConstantHiveVarcharObjectInspector) {
+        return ((WritableConstantHiveVarcharObjectInspector) oi).getWritableConstantValue()
+            .toString().length();
+      }
+      if (oi instanceof WritableHiveVarcharObjectInspector) {
+        return ((WritableHiveVarcharObjectInspector) oi).getMaxLength();
+      }
+    } else if (isChar) {
+      if (oi instanceof WritableConstantHiveCharObjectInspector) {
+        return ((WritableConstantHiveCharObjectInspector) oi).getWritableConstantValue()
+            .toString().length();
+      }
+      if (oi instanceof WritableHiveCharObjectInspector) {
+        return ((WritableHiveCharObjectInspector) oi).getMaxLength();
+      }
+    } else {
+      if (oi instanceof WritableConstantBinaryObjectInspector) {
+
+        // writable constant byte arrays
+        return ((WritableConstantBinaryObjectInspector) oi).getWritableConstantValue()
+            .getLength();
+      }
+      if (oi instanceof WritableBinaryObjectInspector) {
+
+        // return the variable length from config
+        return configVarLen;
+      }
+    }
+
+    // unrecognized object inspector for a variable length type
+    return 0;
+  }
+
+  /**
+   * Get the estimated size of a value described by an object inspector, recursing into the
+   * element inspectors of lists, maps, structs and unions. For non-constant lists and maps the
+   * number of entries is taken from the configuration.
+   *
+   * @param conf
+   *          - hive conf
+   * @param oi
+   *          - object inspector
+   * @return raw data size; 0 for a category that is not handled
+   */
+  public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi) {
+    int listEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_LIST_NUM_ENTRIES);
+    int mapEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_NUM_ENTRIES);
+
+    switch (oi.getCategory()) {
+    case PRIMITIVE: {
+      String colType = oi.getTypeName();
+      boolean stringLike = colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+          || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+          || colType.startsWith(serdeConstants.CHAR_TYPE_NAME);
+      if (stringLike) {
+        int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
+        return JavaDataModel.get().lengthForStringOfLength(avgColLen);
+      }
+      if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+        int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
+        return JavaDataModel.get().lengthForByteArrayOfSize(avgColLen);
+      }
+      return getAvgColLenOfFixedLengthTypes(colType);
+    }
+    case LIST: {
+      if (!(oi instanceof StandardConstantListObjectInspector)) {
+        StandardListObjectInspector sloi = (StandardListObjectInspector) oi;
+
+        // list overhead + (configured number of element in list * size of element)
+        long elemSize = getSizeOfComplexTypes(conf, sloi.getListElementObjectInspector());
+        return JavaDataModel.get().arrayList() + (listEntries * elemSize);
+      }
+
+      // constant list projection of known length
+      StandardConstantListObjectInspector scloi = (StandardConstantListObjectInspector) oi;
+      int length = scloi.getWritableConstantValue().size();
+
+      // check if list elements are primitive or Objects
+      ObjectInspector leoi = scloi.getListElementObjectInspector();
+      if (leoi.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
+        return getSizeOfPrimitiveTypeArraysFromType(leoi.getTypeName(), length);
+      }
+      return JavaDataModel.get().lengthForObjectArrayOfSize(length);
+    }
+    case MAP: {
+      if (oi instanceof StandardConstantMapObjectInspector) {
+
+        // constant map projection of known length
+        return getSizeOfMap((StandardConstantMapObjectInspector) oi);
+      }
+      StandardMapObjectInspector smoi = (StandardMapObjectInspector) oi;
+      long mapSize = getSizeOfComplexTypes(conf, smoi.getMapKeyObjectInspector());
+      mapSize += getSizeOfComplexTypes(conf, smoi.getMapValueObjectInspector());
+
+      // hash map overhead
+      mapSize += JavaDataModel.get().hashMap(mapEntries);
+      return mapSize;
+    }
+    case STRUCT: {
+      StructObjectInspector soi = (StructObjectInspector) oi;
+
+      // constant object overhead for struct
+      long structSize = JavaDataModel.get().object();
+
+      // constant struct field names references overhead
+      structSize += soi.getAllStructFieldRefs().size() * JavaDataModel.get().ref();
+      for (StructField field : soi.getAllStructFieldRefs()) {
+        structSize += getSizeOfComplexTypes(conf, field.getFieldObjectInspector());
+      }
+      return structSize;
+    }
+    case UNION: {
+      UnionObjectInspector uoi = (UnionObjectInspector) oi;
+
+      // constant object overhead for union
+      long unionSize = JavaDataModel.get().object();
+
+      // constant size for unions tags
+      unionSize += uoi.getObjectInspectors().size() * JavaDataModel.get().primitive1();
+      for (ObjectInspector foi : uoi.getObjectInspectors()) {
+        unionSize += getSizeOfComplexTypes(conf, foi);
+      }
+      return unionSize;
+    }
+    default:
+      break;
+    }
+
+    // category not handled above
+    return 0;
+  }
+
+ /**
+ * Get size of fixed length primitives
+ *
+ * @param colType
+ * - column type
+ * @return raw data size
+ */
+ public static long getAvgColLenOfFixedLengthTypes(String colType) {
+ if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
+ return JavaDataModel.get().primitive1();
+ } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
+ return JavaDataModel.get().primitive2();
+ } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+ return JavaDataModel.get().lengthOfTimestamp();
+ } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+ return JavaDataModel.get().lengthOfDate();
+ } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+ return JavaDataModel.get().lengthOfDecimal();
+ } else {
+ return 0;
+ }
+ }
+
+ /**
+ * Get the size of arrays of primitive types
+ *
+ * @param colType
+ * - column type
+ * @param length
+ * - array length
+ * @return raw data size
+ */
+ public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int length) {
+ if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForIntArrayOfSize(length);
+ } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForDoubleArrayOfSize(length);
+ } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForLongArrayOfSize(length);
+ } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForByteArrayOfSize(length);
+ } else if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForBooleanArrayOfSize(length);
+ } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForTimestampArrayOfSize(length);
+ } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForDateArrayOfSize(length);
+ } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+ return JavaDataModel.get().lengthForDecimalArrayOfSize(length);
+ } else {
+ return 0;
+ }
+ }
+
+  /**
+   * Estimate the size of a constant map object: the size of every key and value plus the hash
+   * map overhead for the number of entries.
+   *
+   * @param scmoi
+   *          - object inspector
+   * @return size of map
+   */
+  public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi) {
+    Map<?, ?> constantMap = scmoi.getWritableConstantValue();
+    ObjectInspector keyInspector = scmoi.getMapKeyObjectInspector();
+    ObjectInspector valueInspector = scmoi.getMapValueObjectInspector();
+
+    long size = 0;
+    for (Map.Entry<?, ?> entry : constantMap.entrySet()) {
+      size += getWritableSize(keyInspector, entry.getKey())
+          + getWritableSize(valueInspector, entry.getValue());
+    }
+
+    // add additional overhead of each map entries
+    return size + JavaDataModel.get().hashMap(constantMap.entrySet().size());
+  }
+
+ /**
+ * Get size of primitive data types based on their respective writable object inspector
+ *
+ * @param oi
+ * - object inspector
+ * @param value
+ * - value
+ * @return raw data size
+ */
+ public static long getWritableSize(ObjectInspector oi, Object value) {
+ if (oi instanceof WritableStringObjectInspector) {
+ WritableStringObjectInspector woi = (WritableStringObjectInspector) oi;
+ return JavaDataModel.get().lengthForStringOfLength(
+ woi.getPrimitiveWritableObject(value).getLength());
+ } else if (oi instanceof WritableBinaryObjectInspector) {
+ WritableBinaryObjectInspector woi = (WritableBinaryObjectInspector) oi;
+ return JavaDataModel.get().lengthForByteArrayOfSize(
+ woi.getPrimitiveWritableObject(value).getLength());
+ } else if (oi instanceof WritableBooleanObjectInspector) {
+ return JavaDataModel.get().primitive1();
+ } else if (oi instanceof WritableByteObjectInspector) {
+ return JavaDataModel.get().primitive1();
+ } else if (oi instanceof WritableDateObjectInspector) {
+ return JavaDataModel.get().lengthOfDate();
+ } else if (oi instanceof WritableDoubleObjectInspector) {
+ return JavaDataModel.get().primitive2();
+ } else if (oi instanceof WritableFloatObjectInspector) {
+ return JavaDataModel.get().primitive1();
+ } else if (oi instanceof WritableHiveDecimalObjectInspector) {
+ return JavaDataModel.get().lengthOfDecimal();
+ } else if (oi instanceof WritableIntObjectInspector) {
+ return JavaDataModel.get().primitive1();
+ } else if (oi instanceof WritableLongObjectInspector) {
+ return JavaDataModel.get().primitive2();
+ } else if (oi instanceof WritableShortObjectInspector) {
+ return JavaDataModel.get().primitive1();
+ } else if (oi instanceof WritableTimestampObjectInspector) {
+ return JavaDataModel.get().lengthOfTimestamp();
+ }
+
+ return 0;
+ }
+
+  /**
+   * Get column statistics from parent statistics, one entry per column of the row schema.
+   * The expression of an output column is looked up by its internal name first and, if that
+   * fails, by the name with its prefix stripped. A column for which no statistics can be
+   * derived is represented by a null entry.
+   *
+   * @param conf
+   *          - hive conf
+   * @param parentStats
+   *          - parent statistics
+   * @param colExprMap
+   *          - column expression map
+   * @param rowSchema
+   *          - row schema
+   * @return column statistics; empty if the column expression map is null
+   */
+  public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf,
+      Statistics parentStats,
+      Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {
+
+    List<ColStatistics> result = Lists.newArrayList();
+    if (colExprMap == null) {
+      return result;
+    }
+
+    for (ColumnInfo ci : rowSchema.getSignature()) {
+      String outColName = ci.getInternalName();
+      String outTabAlias = ci.getTabAlias();
+
+      ExprNodeDesc expr = colExprMap.get(outColName);
+      if (expr == null) {
+        // retry the lookup with the prefix removed from the column name
+        outColName = StatsUtils.stripPrefixFromColumnName(outColName);
+        expr = colExprMap.get(outColName);
+      }
+
+      ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, expr);
+      if (colStat != null) {
+        outColName = StatsUtils.stripPrefixFromColumnName(outColName);
+        colStat.setColumnName(outColName);
+        colStat.setTableAlias(outTabAlias);
+      }
+      result.add(colStat);
+    }
+    return result;
+  }
+
+ /**
+ * Get column statistics expression nodes
+ *
+ * @param conf
+ * - hive conf
+ * @param parentStats
+ * - parent statistics
+ * @param end
+ * - expression nodes
+ * @return column statistics
+ */
+ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats,
+ ExprNodeDesc end) {
+
+ if (end == null) {
+ return null;
+ }
+
+ String colName = null;
+ String colType = null;
+ double avgColSize = 0;
+ long countDistincts = 0;
+ long numNulls = 0;
+ ObjectInspector oi = null;
+ long numRows = parentStats.getNumRows();
+ String tabAlias = null;
+
+ if (end instanceof ExprNodeColumnDesc) {
+ // column projection
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
+ colName = encd.getColumn();
+ tabAlias = encd.getTabAlias();
+ colName = stripPrefixFromColumnName(colName);
+
+ if (encd.getIsPartitionColOrVirtualCol()) {
+
+ // vitual columns
+ colType = encd.getTypeInfo().getTypeName();
+ countDistincts = numRows;
+ oi = encd.getWritableObjectInspector();
+ } else {
+
+ // clone the column stats and return
+ ColStatistics result = parentStats.getColumnStatisticsForColumn(tabAlias, colName);
+ if (result != null) {
+ try {
+ return result.clone();
+ } catch (CloneNotSupportedException e) {
+ return null;
+ }
+ }
+ return null;
+ }
+ } else if (end instanceof ExprNodeConstantDesc) {
+
+ // constant projection
+ ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;
+
+ // null projection
+ if (encd.getValue() == null) {
+ colName = encd.getName();
+ colType = "null";
+ numNulls = numRows;
+ } else {
+ colName = encd.getName();
+ colType = encd.getTypeString();
+ countDistincts = 1;
+ oi = encd.getWritableObjectInspector();
+ }
+ } else if (end instanceof ExprNodeGenericFuncDesc) {
+
+ // udf projection
+ ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
+ colName = engfd.getName();
+ colType = engfd.getTypeString();
+ countDistincts = numRows;
+ oi = engfd.getWritableObjectInspector();
+ } else if (end instanceof ExprNodeNullDesc) {
+
+ // null projection
+ ExprNodeNullDesc ennd = (ExprNodeNullDesc) end;
+ colName = ennd.getName();
+ colType = "null";
+ numNulls = numRows;
+ }
+
+ if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+ || colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)
+ || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+ || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
+ || colType.startsWith(serdeConstants.LIST_TYPE_NAME)
+ || colType.startsWith(serdeConstants.MAP_TYPE_NAME)
+ || colType.startsWith(serdeConstants.STRUCT_TYPE_NAME)
+ || colType.startsWith(serdeConstants.UNION_TYPE_NAME)) {
+ avgColSize = getAvgColLenOfVariableLengthTypes(conf, oi, colType);
+ } else {
+ avgColSize = getAvgColLenOfFixedLengthTypes(colType);
+ }
+
+ ColStatistics colStats = new ColStatistics(tabAlias, colName, colType);
+ colStats.setAvgColLen(avgColSize);
+ colStats.setCountDistint(countDistincts);
+ colStats.setNumNulls(numNulls);
+
+ return colStats;
+ }
+
+  /**
+   * Get number of rows of a given table, as recorded under
+   * {@link StatsSetupConst#ROW_COUNT} in the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @return number of rows as reported by
+   *         {@link #getBasicStatForTable(String, String, String)}
+   */
+  public static long getNumRows(String dbName, String tabName) {
+    final String statKey = StatsSetupConst.ROW_COUNT;
+    return getBasicStatForTable(dbName, tabName, statKey);
+  }
+
+  /**
+   * Get raw data size of a given table, as recorded under
+   * {@link StatsSetupConst#RAW_DATA_SIZE} in the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @return raw data size as reported by
+   *         {@link #getBasicStatForTable(String, String, String)}
+   */
+  public static long getRawDataSize(String dbName, String tabName) {
+    final String statKey = StatsSetupConst.RAW_DATA_SIZE;
+    return getBasicStatForTable(dbName, tabName, statKey);
+  }
+
+  /**
+   * Get total size of a given table, as recorded under
+   * {@link StatsSetupConst#TOTAL_SIZE} in the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @return total size as reported by
+   *         {@link #getBasicStatForTable(String, String, String)}
+   */
+  public static long getTotalSize(String dbName, String tabName) {
+    final String statKey = StatsSetupConst.TOTAL_SIZE;
+    return getBasicStatForTable(dbName, tabName, statKey);
+  }
+
+  /**
+   * Get a basic statistic of a table from the table parameters.
+   *
+   * @param dbName
+   *          - database name
+   * @param tabName
+   *          - table name
+   * @param statType
+   *          - type of stats (key into the table parameters)
+   * @return value of the statistic; 0 when the table lookup fails with a HiveException, when the
+   *         table has no parameters, or when the stored value is missing or not a valid long
+   */
+  public static long getBasicStatForTable(String dbName, String tabName, String statType) {
+
+    Map<String, String> tableParams;
+    try {
+      tableParams = Hive.get().getTable(dbName, tabName).getParameters();
+    } catch (HiveException e) {
+      // table could not be fetched: report "no stats"
+      return 0;
+    }
+
+    if (tableParams == null) {
+      return 0;
+    }
+
+    try {
+      // a missing key yields null, which parseLong rejects with NumberFormatException
+      return Long.parseLong(tableParams.get(statType));
+    } catch (NumberFormatException e) {
+      return 0;
+    }
+  }
+
+  /**
+   * Get a basic statistic for each of the named partitions of a table.
+   *
+   * @param table
+   *          - table
+   * @param partNames
+   *          - partition names
+   * @param statType
+   *          - type of stats (key into the partition parameters)
+   * @return one value per fetched partition that has a parameter map; a value is 0 when the
+   *         statistic is missing or not a valid long. Partitions without a parameter map
+   *         contribute no entry. The list is empty when fetching the partitions fails.
+   */
+  public static List<Long> getBasicStatForPartitions(Table table, List<String> partNames,
+      String statType) {
+
+    List<Long> values = Lists.newArrayList();
+
+    List<Partition> partitions;
+    try {
+      partitions = Hive.get().getPartitionsByNames(table, partNames);
+    } catch (HiveException e) {
+      return values;
+    }
+
+    for (Partition partition : partitions) {
+      Map<String, String> partParams = partition.getParameters();
+      if (partParams == null) {
+        // no parameter map: this partition is left out of the result
+        continue;
+      }
+
+      long value;
+      try {
+        value = Long.parseLong(partParams.get(statType));
+      } catch (NumberFormatException e) {
+        value = 0;
+      }
+      values.add(value);
+    }
+    return values;
+  }
+
+  /**
+   * Compute raw data size from column statistics.
+   * <p>
+   * For every non-null entry the number of non-null values (row count minus the column's null
+   * count, never less than zero) is multiplied by a per-value size that depends on the column
+   * type: the average column length for numeric, boolean and unrecognized types, and a
+   * {@link JavaDataModel} estimate for string-like, binary, timestamp, decimal and date types.
+   *
+   * @param numRows
+   *          - number of rows
+   * @param colStats
+   *          - column statistics, may be null and may contain null entries
+   * @return raw data size; 0 when numRows is not positive or colStats is null
+   */
+  public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats) {
+    long result = 0;
+
+    if (numRows <= 0 || colStats == null) {
+      return result;
+    }
+
+    for (ColStatistics cs : colStats) {
+      if (cs != null) {
+        String colType = cs.getColumnType();
+        // a null count larger than the row count must not shrink the estimate, so the
+        // contribution of such a column is clamped to zero
+        long nonNullCount = Math.max(0L, numRows - cs.getNumNulls());
+        if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
+            || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
+            || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
+            || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)
+            || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
+            || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
+            || colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
+
+          result += nonNullCount * cs.getAvgColLen();
+        } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
+            || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+            || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
+
+          int acl = (int) Math.round(cs.getAvgColLen());
+          result += nonNullCount * JavaDataModel.get().lengthForStringOfLength(acl);
+        } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
+
+          int acl = (int) Math.round(cs.getAvgColLen());
+          result += nonNullCount * JavaDataModel.get().lengthForByteArrayOfSize(acl);
+        } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+
+          result += nonNullCount * JavaDataModel.get().lengthOfTimestamp();
+        } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+
+          result += nonNullCount * JavaDataModel.get().lengthOfDecimal();
+        } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
+
+          result += nonNullCount * JavaDataModel.get().lengthOfDate();
+        } else {
+
+          // any other type: fall back to the recorded average column length
+          result += nonNullCount * cs.getAvgColLen();
+        }
+      }
+    }
+
+    return result;
+  }
+
+ /**
+ * Remove KEY/VALUE prefix from column name
+ *
+ * @param colName
+ * - column name
+ * @return column name
+ */
+ public static String stripPrefixFromColumnName(String colName) {
+ String stripedName = colName;
+ if (colName.startsWith("KEY._") || colName.startsWith("VALUE._")) {
+ // strip off KEY./VALUE. from column name
+ stripedName = colName.split("\\.")[1];
+ }
+ return stripedName;
+ }
+
+  /**
+   * Returns the fully qualified name of a column, built from table and column name.
+   *
+   * @param tabName
+   *          - table name
+   * @param colName
+   *          - column name
+   * @return the given names joined with "."; null or empty names are left out
+   */
+  public static String getFullyQualifiedColumnName(String tabName, String colName) {
+    // this overload has no database component
+    final String dbName = null;
+    return getFullyQualifiedName(dbName, tabName, colName);
+  }
+
+ /**
+ * Returns fully qualified name of column
+ *
+ * @param dbName
+ * @param tabName
+ * @param colName
+ * @return
+ */
+ public static String getFullyQualifiedColumnName(String dbName, String tabName, String colName) {
+ return getFullyQualifiedName(dbName, tabName, colName);
+ }
+
+ /**
+ * Returns fully qualified name of column
+ *
+ * @param dbName
+ * @param tabName
+ * @param partName
+ * @param colName
+ * @return
+ */
+ public static String getFullyQualifiedColumnName(String dbName, String tabName, String partName,
+ String colName) {
+ return getFullyQualifiedName(dbName, tabName, partName, colName);
+ }
+
+  /**
+   * Joins the given name parts with "." in the order given, leaving out parts that are null or
+   * empty. Returns an empty string when no part remains.
+   */
+  private static String getFullyQualifiedName(String... names) {
+    StringBuilder joined = new StringBuilder();
+    for (String part : names) {
+      if (part == null || part.isEmpty()) {
+        continue;
+      }
+      // every appended part is non-empty, so a non-empty buffer means a separator is due
+      if (joined.length() > 0) {
+        joined.append('.');
+      }
+      joined.append(part);
+    }
+    return joined.toString();
+  }
+
+  /**
+   * Try to get fully qualified column names from expression nodes.
+   * <ul>
+   * <li>Column reference: table alias plus the output column name found in the expression map
+   * (or the referenced column when the map has no matching entry), with the KEY./VALUE. prefix
+   * stripped.</li>
+   * <li>Generic function: the names derived from its children, joined with ".".</li>
+   * <li>Constant: the string form of its value ("null" for a null constant).</li>
+   * <li>Any other expression type contributes no entry.</li>
+   * </ul>
+   *
+   * @param keyExprs
+   *          - expression nodes, may be null
+   * @param map
+   *          - column expression map, may be null
+   * @return list of fully qualified names
+   */
+  public static List<String> getFullQualifedColNameFromExprs(List<ExprNodeDesc> keyExprs,
+      Map<String, ExprNodeDesc> map) {
+    List<String> result = Lists.newArrayList();
+    if (keyExprs != null) {
+      for (ExprNodeDesc end : keyExprs) {
+        String outColName = null;
+        if (map != null) {
+          // no early exit: when several entries match, the last one iterated wins, as before
+          for (Map.Entry<String, ExprNodeDesc> entry : map.entrySet()) {
+            ExprNodeDesc mapped = entry.getValue();
+            if (mapped != null && mapped.isSame(end)) {
+              outColName = entry.getKey();
+            }
+          }
+        }
+        if (end instanceof ExprNodeColumnDesc) {
+          ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
+          if (outColName == null) {
+            outColName = encd.getColumn();
+          }
+          String tabAlias = encd.getTabAlias();
+          outColName = stripPrefixFromColumnName(outColName);
+          result.add(getFullyQualifiedColumnName(tabAlias, outColName));
+        } else if (end instanceof ExprNodeGenericFuncDesc) {
+          ExprNodeGenericFuncDesc enf = (ExprNodeGenericFuncDesc) end;
+          List<String> cols = getFullQualifedColNameFromExprs(enf.getChildren(), map);
+          String joinedStr = Joiner.on(".").skipNulls().join(cols);
+          result.add(joinedStr);
+        } else if (end instanceof ExprNodeConstantDesc) {
+          ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;
+          // a constant may carry a null value; String.valueOf renders it as "null" instead of
+          // throwing
+          result.add(String.valueOf(encd.getValue()));
+        }
+      }
+    }
+    return result;
+  }
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java Mon Nov 18 19:29:24 2013
@@ -267,6 +267,15 @@ public enum JavaDataModel {
public int lengthForBooleanArrayOfSize(int length) {
return lengthForPrimitiveArrayOfSize(PRIMITIVE_BYTE, length);
}
+  /**
+   * Length of an array of {@code length} timestamps: delegates to
+   * lengthForPrimitiveArrayOfSize with lengthOfTimestamp() as the per-element size.
+   */
+  public int lengthForTimestampArrayOfSize(int length) {
+    final int elementSize = lengthOfTimestamp();
+    return lengthForPrimitiveArrayOfSize(elementSize, length);
+  }
+  /**
+   * Length of an array of {@code length} dates: delegates to
+   * lengthForPrimitiveArrayOfSize with lengthOfDate() as the per-element size.
+   */
+  public int lengthForDateArrayOfSize(int length) {
+    final int elementSize = lengthOfDate();
+    return lengthForPrimitiveArrayOfSize(elementSize, length);
+  }
+  /**
+   * Length of an array of {@code length} decimals: delegates to
+   * lengthForPrimitiveArrayOfSize with lengthOfDecimal() as the per-element size.
+   */
+  public int lengthForDecimalArrayOfSize(int length) {
+    final int elementSize = lengthOfDecimal();
+    return lengthForPrimitiveArrayOfSize(elementSize, length);
+  }
public int lengthOfDecimal() {
// object overhead + 8 bytes for intCompact + 4 bytes for precision
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,74 @@
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+-- column stats are not COMPLETE, so stats are not updated
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc where state='OH';
+
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- state column has 5 distincts. numRows/countDistincts
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where state='OH';
+
+-- not equals comparison shouldn't affect number of rows. rawDataSize is 792 and not 796 because of rounding off issue with avgColLen. avgColLen uses integers and not double.
+-- numRows: 8 rawDataSize: 804
+explain extended select * from loc_orc where state!='OH';
+explain extended select * from loc_orc where state<>'OH';
+
+-- nulls are treated as constant equality comparison
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where zip is null;
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where !(zip is not null);
+
+-- not nulls are treated as inverse of nulls
+-- numRows: 7 rawDataSize: 702
+explain extended select * from loc_orc where zip is not null;
+-- numRows: 7 rawDataSize: 702
+explain extended select * from loc_orc where !(zip is null);
+
+-- NOT evaluation. true will pass all rows, false will not pass any rows
+-- numRows: 8 rawDataSize: 804
+explain extended select * from loc_orc where !false;
+-- numRows: 0 rawDataSize: 0
+explain extended select * from loc_orc where !true;
+
+-- OR evaluation. 1 row for OH and 1 row for CA
+-- numRows: 2 rawDataSize: 204
+explain extended select * from loc_orc where state='OH' or state='CA';
+
+-- AND evaluation. cascadingly apply rules. 8/2 = 4/2 = 2
+-- numRows: 2 rawDataSize: 204
+explain extended select * from loc_orc where year=2001 and year is null;
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where year=2001 and state='OH' and state='FL';
+
+-- AND and OR together. left expr will yield 1 row and right will yield 1 row
+-- numRows: 3 rawDataSize: 306
+explain extended select * from loc_orc where (year=2001 and year is null) or (state='CA');
+
+-- AND and OR together. left expr will yield 8 rows and right will yield 1 row
+-- numRows: 1 rawDataSize: 102
+explain extended select * from loc_orc where (year=2001 or year is null) and (state='CA');
+
+-- for all inequality conditions, rows/3 is the rule
+-- numRows: 2 rawDataSize: 204
+explain extended select * from loc_orc where locid < 30;
+explain extended select * from loc_orc where locid > 30;
+explain extended select * from loc_orc where locid <= 30;
+explain extended select * from loc_orc where locid >= 30;
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,55 @@
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- only one distinct value in year column + 1 NULL value
+-- map-side GBY: numRows: 8 (map-side will not do any reduction)
+-- reduce-side GBY: numRows: 2
+explain extended select year from loc_orc group by year;
+
+-- map-side GBY: numRows: 8
+-- reduce-side GBY: numRows: 4
+explain extended select state,locid from loc_orc group by state,locid;
+
+-- map-side GBY numRows: 32 reduce-side GBY numRows: 16
+explain extended select state,locid from loc_orc group by state,locid with cube;
+
+-- map-side GBY numRows: 24 reduce-side GBY numRows: 12
+explain extended select state,locid from loc_orc group by state,locid with rollup;
+
+-- map-side GBY numRows: 8 reduce-side GBY numRows: 4
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state));
+
+-- map-side GBY numRows: 16 reduce-side GBY numRows: 8
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state),(locid));
+
+-- map-side GBY numRows: 24 reduce-side GBY numRows: 12
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state),(locid),());
+
+-- map-side GBY numRows: 32 reduce-side GBY numRows: 16
+explain extended select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),());
+
+set hive.stats.map.parallelism=10;
+
+-- map-side GBY: numRows: 80 (map-side will not do any reduction)
+-- reduce-side GBY: numRows: 2 Reason: numDistinct of year is 2. numRows = min(80/2, 2)
+explain extended select year from loc_orc group by year;
+
+-- map-side GBY numRows: 320 reduce-side GBY numRows: 42 Reason: numDistinct of state and locid are 6,7 resp. numRows = min(320/2, 6*7)
+explain extended select state,locid from loc_orc group by state,locid with cube;
+
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,80 @@
+create table if not exists emp_staging (
+ lastname string,
+ deptid int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists dept_staging (
+ deptid int,
+ deptname string
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists emp_orc like emp_staging;
+alter table emp_orc set fileformat orc;
+
+create table if not exists dept_orc like dept_staging;
+alter table dept_orc set fileformat orc;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging;
+LOAD DATA LOCAL INPATH '../../data/files/dept.txt' OVERWRITE INTO TABLE dept_staging;
+LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging;
+
+
+insert overwrite table emp_orc select * from emp_staging;
+insert overwrite table dept_orc select * from dept_staging;
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table emp_orc compute statistics for columns lastname,deptid;
+analyze table dept_orc compute statistics for columns deptname,deptid;
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- number of rows
+-- emp_orc - 6
+-- dept_orc - 4
+-- loc_orc - 8
+
+-- count distincts for relevant columns (since count distinct values are approximate, in some cases count distinct values will be greater than number of rows)
+-- emp_orc.deptid - 3
+-- emp_orc.lastname - 7
+-- dept_orc.deptid - 6
+-- dept_orc.deptname - 5
+-- loc_orc.locid - 6
+-- loc_orc.state - 7
+
+-- Expected output rows: 4
+-- Reason: #rows = (6*4)/max(3,6)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid);
+
+-- 3 way join
+-- Expected output rows: 4
+-- Reason: #rows = (6*4*6)/max(3,6)*max(6,3)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid) join emp_orc e1 on (e.deptid = e1.deptid);
+
+-- Expected output rows: 5
+-- Reason: #rows = (6*4*8)/max(3,6)*max(6,6)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid) join loc_orc l on (e.deptid = l.locid);
+
+-- join keys of different types
+-- Expected output rows: 4
+-- Reason: #rows = (6*4*8)/max(3,6)*max(6,7)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid) join loc_orc l on (e.deptid = l.state);
+
+-- multi-attribute join
+-- Expected output rows: 0
+-- Reason: #rows = (6*4)/max(3,6)*max(7,5)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid and e.lastname = d.deptname);
+
+-- 3 way and multi-attribute join
+-- Expected output rows: 0
+-- Reason: #rows = (6*4*8)/max(3,6)*max(7,5)*max(3,6)*max(7,7)
+explain extended select * from emp_orc e join dept_orc d on (e.deptid = d.deptid and e.lastname = d.deptname) join loc_orc l on (e.deptid = l.locid and e.lastname = l.state);
+
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,28 @@
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table loc_orc compute statistics for columns state, locid, zip, year;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+-- numRows: 4 rawDataSize: 396
+explain extended select * from loc_orc limit 4;
+
+-- greater than the available number of rows
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc limit 16;
+
+-- numRows: 0 rawDataSize: 0
+explain extended select * from loc_orc limit 0;
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,78 @@
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging;
+
+create table if not exists loc_orc (
+ state string,
+ locid int,
+ zip bigint
+) partitioned by(year int) stored as orc;
+
+-- basicStatState: NONE colStatState: NONE
+explain extended select * from loc_orc;
+
+set hive.stats.autogather=false;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+
+insert overwrite table loc_orc partition(year) select * from loc_staging;
+
+-- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from loc_orc;
+
+-- partition level analyze statistics for specific partition
+analyze table loc_orc partition(year=2001) compute statistics;
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__';
+
+-- basicStatState: PARTIAL colStatState: NONE
+explain extended select * from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc where year=2001;
+
+-- partition level analyze statistics for all partitions
+analyze table loc_orc partition(year) compute statistics;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__';
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select * from loc_orc where year=2001 or year='__HIVE_DEFAULT_PARTITION__';
+
+-- both partitions will be pruned
+-- basicStatState: NONE colStatState: NONE
+explain extended select * from loc_orc where year=2001 and year='__HIVE_DEFAULT_PARTITION__';
+
+-- partition level partial column statistics
+analyze table loc_orc partition(year=2001) compute statistics for columns state,locid;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select zip from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select state from loc_orc;
+
+-- column statistics for __HIVE_DEFAULT_PARTITION__ is not supported yet. Hence colStatState reports PARTIAL
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select state,locid from loc_orc;
+
+-- basicStatState: COMPLETE colStatState: COMPLETE
+explain extended select state,locid from loc_orc where year=2001;
+
+-- basicStatState: COMPLETE colStatState: NONE
+explain extended select state,locid from loc_orc where year!=2001;
+
+-- basicStatState: COMPLETE colStatState: PARTIAL
+explain extended select * from loc_orc;
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,140 @@
+create table if not exists alltypes (
+ bo1 boolean,
+ ti1 tinyint,
+ si1 smallint,
+ i1 int,
+ bi1 bigint,
+ f1 float,
+ d1 double,
+ de1 decimal,
+ ts1 timestamp,
+ da1 timestamp,
+ s1 string,
+ m1 map<string, string>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:string>
+) row format delimited fields terminated by '|'
+collection items terminated by ','
+map keys terminated by ':' stored as textfile;
+
+create table alltypes_orc like alltypes;
+alter table alltypes_orc set fileformat orc;
+
+load data local inpath '../../data/files/alltypes.txt' overwrite into table alltypes;
+
+insert overwrite table alltypes_orc select * from alltypes;
+
+-- basicStatState: COMPLETE colStatState: NONE numRows: 2 rawDataSize: 1514
+explain extended select * from alltypes_orc;
+
+-- statistics for complex types are not supported yet
+analyze table alltypes_orc compute statistics for columns bo1, ti1, si1, i1, bi1, f1, d1,s1;
+
+-- numRows: 2 rawDataSize: 1514
+explain extended select * from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 8
+explain extended select bo1 from alltypes_orc;
+
+-- col alias renaming
+-- numRows: 2 rawDataSize: 8
+explain extended select i1 as int1 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 174
+explain extended select s1 from alltypes_orc;
+
+-- column statistics for complex types unsupported and so statistics will not be updated
+-- numRows: 2 rawDataSize: 1514
+explain extended select m1 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 246
+explain extended select bo1, ti1, si1, i1, bi1, f1, d1,s1 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 0
+explain extended select null from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 8
+explain extended select 11 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 16
+explain extended select 11L from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 16
+explain extended select 11.0 from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 178
+explain extended select "hello" from alltypes_orc;
+explain extended select cast("hello" as char(5)) from alltypes_orc;
+explain extended select cast("hello" as varchar(5)) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 96
+explain extended select unbase64("0xe23") from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 16
+explain extended select cast("1" as TINYINT), cast("20" as SMALLINT) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 80
+explain extended select cast("1970-12-31 15:59:58.174" as TIMESTAMP) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 112
+explain extended select cast("1970-12-31 15:59:58.174" as DATE) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 224
+explain extended select cast("58.174" as DECIMAL) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 112
+explain extended select array(1,2,3) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 1508
+explain extended select str_to_map("a=1 b=2 c=3", " ", "=") from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 112
+explain extended select NAMED_STRUCT("a", 11, "b", 11) from alltypes_orc;
+
+-- numRows: 2 rawDataSize: 250
+explain extended select CREATE_UNION(0, "hello") from alltypes_orc;
+
+-- COUNT(*) is projected as new column. It is not projected as GenericUDF and so datasize estimate will be based on number of rows
+-- numRows: 1 rawDataSize: 8
+explain extended select count(*) from alltypes_orc;
+
+-- COUNT(1) is projected as new column. It is not projected as GenericUDF and so datasize estimate will be based on number of rows
+-- numRows: 1 rawDataSize: 8
+explain extended select count(1) from alltypes_orc;
+
+-- column statistics for complex column types will be missing. data size will be calculated from available column statistics
+-- numRows: 2 rawDataSize: 254
+explain extended select *,11 from alltypes_orc;
+
+-- subquery selects
+-- inner select - numRows: 2 rawDataSize: 8
+-- outer select - numRows: 2 rawDataSize: 8
+explain extended select i1 from (select i1 from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 16
+-- outer select - numRows: 2 rawDataSize: 8
+explain extended select i1 from (select i1,11 from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 16
+-- outer select - numRows: 2 rawDataSize: 186
+explain extended select i1,"hello" from (select i1,11 from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 24
+-- outer select - numRows: 2 rawDataSize: 16
+explain extended select x from (select i1,11.0 as x from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 104
+-- outer select - numRows: 2 rawDataSize: 186
+explain extended select x,"hello" from (select i1 as x, unbase64("0xe23") as ub from alltypes_orc limit 10) temp;
+
+-- inner select - numRows: 2 rawDataSize: 186
+-- middle select - numRows: 2 rawDataSize: 178
+-- outer select - numRows: 2 rawDataSize: 194
+explain extended select h, 11.0 from (select hell as h from (select i1, "hello" as hell from alltypes_orc limit 10) in1 limit 10) in2;
+
+-- This test is for FILTER operator where filter expression is a boolean column
+-- numRows: 2 rawDataSize: 8
+explain extended select bo1 from alltypes_orc where bo1;
+
+-- numRows: 0 rawDataSize: 0
+explain extended select bo1 from alltypes_orc where !bo1;
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,52 @@
+create table if not exists emp_staging (
+ lastname string,
+ deptid int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table if not exists emp_orc like emp_staging;
+alter table emp_orc set fileformat orc;
+
+-- basicStatsState: NONE colStatsState: NONE
+explain extended select * from emp_orc;
+
+LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging;
+
+set hive.stats.autogather=false;
+
+insert overwrite table emp_orc select * from emp_staging;
+
+-- stats autogather is disabled, so basic stats will report the file size but not the raw data size; the initial basic stats state will therefore be PARTIAL
+
+-- basicStatsState: PARTIAL colStatsState: NONE
+explain extended select * from emp_orc;
+
+-- table level analyze statistics
+analyze table emp_orc compute statistics;
+
+-- basicStatsState: COMPLETE colStatsState: NONE
+explain extended select * from emp_orc;
+
+-- column level partial statistics
+analyze table emp_orc compute statistics for columns deptid;
+
+-- basicStatsState: COMPLETE colStatsState: PARTIAL
+explain extended select * from emp_orc;
+
+-- all selected columns have statistics
+-- basicStatsState: COMPLETE colStatsState: COMPLETE
+explain extended select deptid from emp_orc;
+
+-- column level complete statistics
+analyze table emp_orc compute statistics for columns lastname,deptid;
+
+-- basicStatsState: COMPLETE colStatsState: COMPLETE
+explain extended select * from emp_orc;
+
+-- basicStatsState: COMPLETE colStatsState: COMPLETE
+explain extended select lastname from emp_orc;
+
+-- basicStatsState: COMPLETE colStatsState: COMPLETE
+explain extended select deptid from emp_orc;
+
+-- basicStatsState: COMPLETE colStatsState: COMPLETE
+explain extended select lastname,deptid from emp_orc;
Added: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q Mon Nov 18 19:29:24 2013
@@ -0,0 +1,53 @@
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- numRows: 8 rawDataSize: 688
+explain extended select state from loc_orc;
+
+-- numRows: 16 rawDataSize: 1376
+explain extended select * from (select state from loc_orc union all select state from loc_orc) tmp;
+
+-- numRows: 8 rawDataSize: 796
+explain extended select * from loc_orc;
+
+-- numRows: 16 rawDataSize: 1592
+explain extended select * from (select * from loc_orc union all select * from loc_orc) tmp;
+
+create database test;
+use test;
+create table if not exists loc_staging (
+ state string,
+ locid int,
+ zip bigint,
+ year int
+) row format delimited fields terminated by '|' stored as textfile;
+
+create table loc_orc like loc_staging;
+alter table loc_orc set fileformat orc;
+
+load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging;
+
+insert overwrite table loc_orc select * from loc_staging;
+
+analyze table loc_staging compute statistics;
+analyze table loc_staging compute statistics for columns state,locid,zip,year;
+analyze table loc_orc compute statistics for columns state,locid,zip,year;
+
+-- numRows: 16 rawDataSize: 1376
+explain extended select * from (select state from default.loc_orc union all select state from test.loc_orc) temp;
+
+-- numRows: 16 rawDataSize: 1376
+explain extended select * from (select state from test.loc_staging union all select state from test.loc_orc) temp;
Modified: hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out Mon Nov 18 19:29:24 2013
@@ -112,16 +112,24 @@ STAGE PLANS:
alter_coltype
TableScan
alias: alter_coltype
+ Statistics:
+ numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
GatherStats: false
Select Operator
+ Statistics:
+ numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
Group By Operator
aggregations:
expr: count()
bucketGroup: false
mode: hash
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Reduce Output Operator
sort order:
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
tag: -1
value expressions:
expr: _col0
@@ -181,16 +189,22 @@ STAGE PLANS:
bucketGroup: false
mode: mergepartial
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Select Operator
expressions:
expr: _col0
type: bigint
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -308,16 +322,24 @@ STAGE PLANS:
alter_coltype
TableScan
alias: alter_coltype
+ Statistics:
+ numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
GatherStats: false
Select Operator
+ Statistics:
+ numRows: 25 dataSize: 191 basicStatsState: COMPLETE colStatsState: COMPLETE
Group By Operator
aggregations:
expr: count()
bucketGroup: false
mode: hash
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Reduce Output Operator
sort order:
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
tag: -1
value expressions:
expr: _col0
@@ -377,16 +399,22 @@ STAGE PLANS:
bucketGroup: false
mode: mergepartial
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Select Operator
expressions:
expr: _col0
type: bigint
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -457,21 +485,31 @@ STAGE PLANS:
alter_coltype
TableScan
alias: alter_coltype
+ Statistics:
+ numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: COMPLETE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate:
expr: ((ts = 3.0) and (dt = 10))
type: boolean
+ Statistics:
+ numRows: 75 dataSize: 0 basicStatsState: COMPLETE colStatsState: COMPLETE
Select Operator
+ Statistics:
+ numRows: 75 dataSize: 0 basicStatsState: COMPLETE colStatsState: COMPLETE
Group By Operator
aggregations:
expr: count()
bucketGroup: false
mode: hash
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Reduce Output Operator
sort order:
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
tag: -1
value expressions:
expr: _col0
@@ -617,16 +655,22 @@ STAGE PLANS:
bucketGroup: false
mode: mergepartial
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Select Operator
expressions:
expr: _col0
type: bigint
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -769,6 +813,8 @@ STAGE PLANS:
alter_coltype
TableScan
alias: alter_coltype
+ Statistics:
+ numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: NONE
GatherStats: false
Select Operator
expressions:
@@ -781,11 +827,15 @@ STAGE PLANS:
expr: ts
type: string
outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics:
+ numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: NONE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
+ Statistics:
+ numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: NONE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -1008,16 +1058,24 @@ STAGE PLANS:
alter_coltype
TableScan
alias: alter_coltype
+ Statistics:
+ numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: COMPLETE
GatherStats: false
Select Operator
+ Statistics:
+ numRows: 75 dataSize: 573 basicStatsState: COMPLETE colStatsState: COMPLETE
Group By Operator
aggregations:
expr: count()
bucketGroup: false
mode: hash
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Reduce Output Operator
sort order:
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
tag: -1
value expressions:
expr: _col0
@@ -1163,16 +1221,22 @@ STAGE PLANS:
bucketGroup: false
mode: mergepartial
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
Select Operator
expressions:
expr: _col0
type: bigint
outputColumnNames: _col0
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
+ Statistics:
+ numRows: 1 dataSize: 8 basicStatsState: COMPLETE colStatsState: COMPLETE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.TextInputFormat