You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by th...@apache.org on 2013/10/31 22:22:02 UTC
svn commit: r1537667 [1/3] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/ conf/
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/java/org/apache/hadoop/hive/ql/metadata/
ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org...
Author: thejas
Date: Thu Oct 31 21:22:02 2013
New Revision: 1537667
URL: http://svn.apache.org/r1537667
Log:
HIVE-5483 : use metastore statistics to optimize max/min/etc. queries (Ashutosh Chauhan via Thejas Nair)
Added:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q
hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q
hive/trunk/ql/src/test/results/clientpositive/metadata_only_queries.q.out
hive/trunk/ql/src/test/results/clientpositive/stats_only_null.q.out
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/conf/hive-default.xml.template
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Thu Oct 31 21:22:02 2013
@@ -658,6 +658,8 @@ public class HiveConf extends Configurat
HIVEFETCHTASKAGGR("hive.fetch.task.aggr", false),
+ HIVEOPTIMIZEMETADATAQUERIES("hive.compute.query.using.stats", false),
+
// Serde for FetchTask
HIVEFETCHOUTPUTSERDE("hive.fetch.output.serde", "org.apache.hadoop.hive.serde2.DelimitedJSONSerDe"),
Modified: hive/trunk/conf/hive-default.xml.template
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Thu Oct 31 21:22:02 2013
@@ -2032,6 +2032,17 @@
</property>
<property>
+ <name>hive.compute.query.using.stats</name>
+ <value>false</value>
+ <description>
+ When set to true, Hive will answer a few queries, such as count(1), purely using stats
+ stored in the metastore. For basic stats collection, set the config hive.stats.autogather to true.
+ For more advanced stats collection, you need to run analyze table queries.
+ </description>
+</property>
+
+
+<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
<description>
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java Thu Oct 31 21:22:02 2013
@@ -498,6 +498,13 @@ public class FetchOperator implements Se
* Currently only used by FetchTask.
**/
public boolean pushRow() throws IOException, HiveException {
+ if(work.getRowsComputedUsingStats() != null) {
+ for (List<Object> row : work.getRowsComputedUsingStats()) {
+ operator.process(row, 0);
+ }
+ operator.flush();
+ return true;
+ }
InspectableObject row = getNextRow();
if (row != null) {
pushRow(row);
@@ -609,6 +616,9 @@ public class FetchOperator implements Se
* returns output ObjectInspector, never null
*/
public ObjectInspector getOutputObjectInspector() throws HiveException {
+ if(null != work.getStatRowOI()) {
+ return work.getStatRowOI();
+ }
try {
if (work.isNotPartitioned()) {
return getRowInspectorFromTable(work.getTblDesc());
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java Thu Oct 31 21:22:02 2013
@@ -48,6 +48,8 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.common.classification.InterfaceAudience.LimitedPrivate;
+import org.apache.hadoop.hive.common.classification.InterfaceStability.Unstable;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.HiveMetaException;
@@ -1730,7 +1732,7 @@ private void constructOneLBLocationMap(F
* @param tbl table for which partitions are needed
* @return list of partition objects
*/
- public Set<Partition> getAllPartitionsForPruner(Table tbl) throws HiveException {
+ public Set<Partition> getAllPartitionsOf(Table tbl) throws HiveException {
if (!tbl.isPartitioned()) {
return Sets.newHashSet(new Partition(tbl));
}
@@ -2405,21 +2407,13 @@ private void constructOneLBLocationMap(F
HiveMetaStoreClient.class.getName());
}
- /*
- * This api just sets up a metastore client. This is used for
- * pre-launching the metastore client so as to reduce latency
- * within a single session.
- */
- public void setupMSC() throws MetaException {
- getMSC();
- }
-
/**
- *
* @return the metastore client for the current thread
* @throws MetaException
*/
- private IMetaStoreClient getMSC() throws MetaException {
+ @LimitedPrivate(value = {"Hive"})
+ @Unstable
+ public IMetaStoreClient getMSC() throws MetaException {
if (metaStoreClient == null) {
metaStoreClient = createMetaStoreClient();
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java Thu Oct 31 21:22:02 2013
@@ -111,6 +111,9 @@ public class Optimizer {
if (HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE) > 0) {
transformations.add(new LimitPushdownOptimizer());
}
+ if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTIMIZEMETADATAQUERIES)) {
+ transformations.add(new StatsOptimizer());
+ }
transformations.add(new SimpleFetchOptimizer()); // must be called last
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEFETCHTASKAGGR)) {
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java?rev=1537667&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java Thu Oct 31 21:22:02 2013
@@ -0,0 +1,431 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.FetchTask;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.FetchWork;
+import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+/**
+ * Answers simple aggregate queries directly from statistics stored in the metastore.
+ *
+ * <p>Queries such as {@code count(*)}, {@code count(a)}, {@code max(a)} and {@code min(b)} over
+ * a whole table can be computed from the row counts and column statistics that Hive already
+ * collects for query planning purposes. This optimizer looks at the query plan and, when every
+ * aggregate in it can be answered that way, sets a {@link FetchTask} on the parse context whose
+ * single row is composed from those statistics.
+ *
+ * <p>The optimization is best effort: whenever a statistic is missing or unusable, or the plan
+ * contains something this class does not understand, the plan is left untouched so that the
+ * regular (slower) plan is executed instead.
+ */
+public class StatsOptimizer implements Transform {
+
+  private static final Log LOG = LogFactory.getLog(StatsOptimizer.class);
+
+  /**
+   * Walks the operator tree and hands every TS-SEL-GBY-RS-GBY-SEL-FS pipeline to
+   * {@link MetaDataProcessor}.
+   *
+   * @param pctx parse context of the query being optimized
+   * @return the same parse context; a fetch task may have been set on it
+   * @throws SemanticException if walking the operator tree fails
+   */
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+
+    // Skip when a fetch task is already set, when the statement is not a plain query
+    // (analyze rewrite, CTAS), or when it has more than one load-file work or any
+    // load-table work.
+    if (pctx.getFetchTask() != null || !pctx.getQB().getIsQuery() ||
+        pctx.getQB().isAnalyzeRewrite() || pctx.getQB().isCTAS() ||
+        pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()) {
+      return pctx;
+    }
+
+    String TS = TableScanOperator.getOperatorName() + "%";
+    String GBY = GroupByOperator.getOperatorName() + "%";
+    String RS = ReduceSinkOperator.getOperatorName() + "%";
+    String SEL = SelectOperator.getOperatorName() + "%";
+    String FS = FileSinkOperator.getOperatorName() + "%";
+
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    opRules.put(new RuleRegExp("R1", TS + SEL + GBY + RS + GBY + SEL + FS),
+        new MetaDataProcessor(pctx));
+
+    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null);
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+    return pctx;
+  }
+
+  /**
+   * Node processor that tries to replace a matched aggregate pipeline with a fetch task
+   * whose single row is computed from metastore statistics.
+   */
+  private static class MetaDataProcessor implements NodeProcessor {
+
+    private final ParseContext pctx;
+
+    public MetaDataProcessor (ParseContext pctx) {
+      this.pctx = pctx;
+    }
+
+    /** Kind of column statistics object that applies to a column type. */
+    enum StatType{
+      Integral,
+      Double,
+      String,
+      Boolean,
+      Binary,
+      Unsupported
+    }
+
+    /**
+     * Maps a column type name to the kind of column statistics kept for it.
+     *
+     * @param origType type name of the column
+     * @return the matching {@link StatType}, or {@link StatType#Unsupported} if none matches
+     */
+    private StatType getType(String origType) {
+      if (serdeConstants.IntegralTypes.contains(origType)) {
+        return StatType.Integral;
+      } else if (origType.equals(serdeConstants.DOUBLE_TYPE_NAME) ||
+          origType.equals(serdeConstants.FLOAT_TYPE_NAME)) {
+        return StatType.Double;
+      } else if (origType.equals(serdeConstants.BINARY_TYPE_NAME)) {
+        return StatType.Binary;
+      } else if (origType.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
+        return StatType.Boolean;
+      } else if (origType.equals(serdeConstants.STRING_TYPE_NAME)) {
+        return StatType.String;
+      }
+      return StatType.Unsupported;
+    }
+
+    /**
+     * Reads the number of nulls from the statistics object that corresponds to {@code type}.
+     *
+     * @param type kind of statistics to read
+     * @param statData column statistics of one column
+     * @return the null count, or {@code null} if {@code type} is {@link StatType#Unsupported}
+     */
+    private Long getNullcountFor(StatType type, ColumnStatisticsData statData) {
+
+      switch(type) {
+      case Integral :
+        return statData.getLongStats().getNumNulls();
+      case Double:
+        return statData.getDoubleStats().getNumNulls();
+      case String:
+        return statData.getStringStats().getNumNulls();
+      case Boolean:
+        return statData.getBooleanStats().getNumNulls();
+      case Binary:
+        return statData.getBinaryStats().getNumNulls();
+      default:
+        return null;
+      }
+    }
+
+    /** Returns the SQL name a UDAF class declares through its {@link Description} annotation. */
+    private static String udafName(Class<?> udafClass) {
+      return udafClass.getAnnotation(Description.class).name();
+    }
+
+    /** Returns true only if every expression of the select operator is a plain column. */
+    private static boolean hasOnlyColumnRefs(SelectOperator selOp) {
+      List<ExprNodeDesc> cols = selOp.getConf().getColList();
+      if (cols == null) {
+        return false;
+      }
+      for (ExprNodeDesc desc : cols) {
+        if (!(desc instanceof ExprNodeColumnDesc)) {
+          // Probably an expression, cant handle that
+          return false;
+        }
+      }
+      return true;
+    }
+
+    /** Fetches the table-level column statistics of {@code colName} from the metastore. */
+    private static ColumnStatisticsData getTableColStats(Hive hive, Table tbl, String colName)
+        throws Exception {
+      //TODO: After HIVE-3777 use the property to figure out if following
+      // stats is fresh or not.
+      return hive.getMSC().getTableColumnStatistics(
+          tbl.getDbName(), tbl.getTableName(), colName).
+          getStatsObjIterator().next().getStatsData();
+    }
+
+    /** Fetches the column statistics of {@code colName} for one partition from the metastore. */
+    private static ColumnStatisticsData getPartitionColStats(Hive hive, Table tbl,
+        String partName, String colName) throws Exception {
+      //TODO: After HIVE-3777 use the property to figure out if following
+      // stats is fresh or not.
+      return hive.getMSC().getPartitionColumnStatistics(
+          tbl.getDbName(), tbl.getTableName(), partName, colName).
+          getStatsObjIterator().next().getStatsData();
+    }
+
+    /**
+     * Tries to answer the matched pipeline from statistics.
+     *
+     * <ol>
+     * <li>Check that the plan is eligible: no sub-query, no expressions in either select,
+     *     no group-by keys, no DISTINCT.</li>
+     * <li>For every aggregate (count, max, min) fetch the needed statistics.</li>
+     * <li>Compose one result row and set a {@link FetchTask} carrying it on the parse
+     *     context.</li>
+     * </ol>
+     * Any failure or unsupported construct leaves the parse context untouched.
+     *
+     * @return always {@code null}
+     */
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+
+      TableScanOperator tsOp = (TableScanOperator) stack.get(0);
+      if (tsOp.getParentOperators() != null && tsOp.getParentOperators().size() > 0) {
+        // looks like a subq plan.
+        return null;
+      }
+      SelectOperator selOp = (SelectOperator) tsOp.getChildren().get(0);
+      if (!hasOnlyColumnRefs(selOp)) {
+        return null;
+      }
+      // Since we have done an exact match on TS-SEL-GBY-RS-GBY-SEL-FS
+      // we need not to do any instanceof checks for following.
+      GroupByOperator gbyOp = (GroupByOperator) selOp.getChildren().get(0);
+      Node rsOp = gbyOp.getChildren().get(0);
+      Node finalGbyOp = rsOp.getChildren().get(0);
+      SelectOperator finalSelOp = (SelectOperator) finalGbyOp.getChildren().get(0);
+      FileSinkOperator fsOp = (FileSinkOperator) finalSelOp.getChildren().get(0);
+      if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
+        // looks like a subq plan.
+        return null;
+      }
+      List<AggregationDesc> aggrs = gbyOp.getConf().getAggregators();
+      if (gbyOp.getConf().getOutputColumnNames().size() != aggrs.size()) {
+        // The group-by also emits key columns, i.e. the query has a GROUP BY clause.
+        // Statistics can only answer whole-table aggregates.
+        LOG.debug("Group by keys present, can't answer query using stats");
+        return null;
+      }
+      if (!hasOnlyColumnRefs(finalSelOp)
+          || finalSelOp.getConf().getColList().size() != aggrs.size()) {
+        // The final select computes something on top of the aggregates (for example
+        // count(*) + 1, or an extra constant column). Skipping it would change the result.
+        LOG.debug("Final select is not a plain projection of the aggregates");
+        return null;
+      }
+
+      Table tbl = pctx.getTopToTable().get(tsOp);
+      List<Object> oneRow = new ArrayList<Object>();
+      List<ObjectInspector> ois = new ArrayList<ObjectInspector>();
+      try{
+        Hive hive = Hive.get(pctx.getConf());
+
+        for (AggregationDesc aggr : aggrs) {
+          if (aggr.getDistinct()) {
+            // Row counts and null counts say nothing about the number of distinct values.
+            LOG.debug("Distinct aggregation can't be answered using stats: "
+                + aggr.getGenericUDAFName());
+            return null;
+          }
+          String name = aggr.getGenericUDAFName();
+          if (name.equals(udafName(GenericUDAFCount.class))) {
+            Long rowCnt = computeCount(hive, tbl, aggr);
+            if (rowCnt == null) {
+              return null;
+            }
+            oneRow.add(rowCnt);
+            ois.add(PrimitiveObjectInspectorFactory.
+                getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+          } else if (name.equals(udafName(GenericUDAFMax.class))) {
+            if (!addMinOrMax(hive, tbl, aggr, true, oneRow, ois)) {
+              return null;
+            }
+          } else if (name.equals(udafName(GenericUDAFMin.class))) {
+            if (!addMinOrMax(hive, tbl, aggr, false, oneRow, ois)) {
+              return null;
+            }
+          } else { // Unsupported aggregation.
+            LOG.debug("Unsupported aggregation for metadata optimizer: "
+                + aggr.getGenericUDAFName());
+            return null;
+          }
+        }
+      } catch (Exception e) {
+        // this is best effort optimization, bail out in error conditions and
+        // try generate and execute slower plan
+        LOG.debug("Failed to optimize using metadata optimizer", e);
+        return null;
+      }
+
+      List<List<Object>> allRows = new ArrayList<List<Object>>();
+      allRows.add(oneRow);
+
+      List<String> colNames = new ArrayList<String>();
+      for (ColumnInfo colInfo: gbyOp.getSchema().getSignature()) {
+        colNames.add(colInfo.getInternalName());
+      }
+      StandardStructObjectInspector sOI = ObjectInspectorFactory.
+          getStandardStructObjectInspector(colNames, ois);
+      FetchWork fWork = new FetchWork(allRows, sOI);
+      FetchTask fTask = (FetchTask)TaskFactory.get(fWork, pctx.getConf());
+      fWork.setLimit(allRows.size());
+      pctx.setFetchTask(fTask);
+
+      return null;
+    }
+
+    /**
+     * Computes count(*), count(constant) or count(col) from row counts and null counts.
+     *
+     * @return the count, or {@code null} if it cannot be derived from statistics
+     * @throws Exception on any metastore or parsing failure; the caller treats it as a bail out
+     */
+    private Long computeCount(Hive hive, Table tbl, AggregationDesc aggr) throws Exception {
+      List<ExprNodeDesc> params = aggr.getParameters();
+      String colName = null;
+      StatType type = null;
+
+      if (params.isEmpty() || params.get(0) instanceof ExprNodeConstantDesc) {
+        // Its either count (*) or count(1) case
+        if (!params.isEmpty() && ((ExprNodeConstantDesc) params.get(0)).getValue() == null) {
+          // count(NULL) is always 0, not the row count; let the regular plan handle it.
+          LOG.debug("count over a null constant can't be answered using row count");
+          return null;
+        }
+      } else {
+        // Its count(col) case
+        if (!(params.get(0) instanceof ExprNodeColumnDesc)) {
+          // this is weird, we got expr or something in there, bail out
+          LOG.debug("Unexpected expression : " + params.get(0));
+          return null;
+        }
+        ExprNodeColumnDesc desc = (ExprNodeColumnDesc) params.get(0);
+        colName = desc.getColumn();
+        type = getType(desc.getTypeString());
+        if (type == StatType.Unsupported) {
+          LOG.debug("Unsupported type: " + desc.getTypeString() + " encountered in " +
+              "metadata optimizer for column : " + colName);
+          return null;
+        }
+      }
+
+      long rowCnt = 0;
+      if (!tbl.isPartitioned()) {
+        rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT));
+        if (rowCnt < 1) {
+          // if rowCnt < 1 than its either empty table or table on which stats are not
+          // computed We assume the worse and don't attempt to optimize.
+          LOG.debug("Table doesn't have upto date stats " + tbl.getTableName());
+          return null;
+        }
+        if (colName != null) {
+          // type is a supported one here, so the null count is never null
+          rowCnt -= getNullcountFor(type, getTableColStats(hive, tbl, colName));
+        }
+      } else {
+        for (Partition part : hive.getAllPartitionsOf(tbl)) {
+          long partRowCnt = Long.parseLong(part.getParameters()
+              .get(StatsSetupConst.ROW_COUNT));
+          if (partRowCnt < 1) {
+            LOG.debug("Partition doesn't have upto date stats " + part.getSpec());
+            return null;
+          }
+          rowCnt += partRowCnt;
+          if (colName != null) {
+            // type is a supported one here, so the null count is never null
+            rowCnt -= getNullcountFor(type,
+                getPartitionColStats(hive, tbl, part.getName(), colName));
+          }
+        }
+      }
+      return rowCnt;
+    }
+
+    /**
+     * Computes max(col) or min(col) from the high/low values kept in column statistics and
+     * appends the value and its object inspector to the result row.
+     *
+     * @param wantMax {@code true} for max, {@code false} for min
+     * @return {@code false} if the aggregate cannot be answered from statistics; nothing has
+     *         been appended in that case
+     * @throws Exception on any metastore failure; the caller treats it as a bail out
+     */
+    private boolean addMinOrMax(Hive hive, Table tbl, AggregationDesc aggr, boolean wantMax,
+        List<Object> oneRow, List<ObjectInspector> ois) throws Exception {
+      ExprNodeDesc param = aggr.getParameters().get(0);
+      if (!(param instanceof ExprNodeColumnDesc)) {
+        LOG.debug("Unexpected expression : " + param);
+        return false;
+      }
+      ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) param;
+      String colName = colDesc.getColumn();
+      StatType type = getType(colDesc.getTypeString());
+      if (type != StatType.Integral && type != StatType.Double) {
+        // unsupported type
+        LOG.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+            "metadata optimizer for column : " + colName);
+        return false;
+      }
+
+      List<ColumnStatisticsData> stats = new ArrayList<ColumnStatisticsData>();
+      if (!tbl.isPartitioned()) {
+        stats.add(getTableColStats(hive, tbl, colName));
+      } else {
+        List<String> parts = hive.getMSC().listPartitionNames(tbl.getDbName(),
+            tbl.getTableName(), (short)-1);
+        if (parts.isEmpty()) {
+          // No partition means no rows; the answer is NULL, not a sentinel value.
+          LOG.debug("Table has no partitions " + tbl.getTableName());
+          return false;
+        }
+        for (String part : parts) {
+          stats.add(getPartitionColStats(hive, tbl, part, colName));
+        }
+      }
+
+      if (type == StatType.Integral) {
+        long best = wantMax ? Long.MIN_VALUE : Long.MAX_VALUE;
+        for (ColumnStatisticsData statData : stats) {
+          long cur = wantMax ? statData.getLongStats().getHighValue()
+              : statData.getLongStats().getLowValue();
+          best = wantMax ? Math.max(best, cur) : Math.min(best, cur);
+        }
+        oneRow.add(best);
+        ois.add(PrimitiveObjectInspectorFactory.
+            getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+      } else {
+        // Double.MIN_VALUE is the smallest *positive* double, so it must not be used as the
+        // starting point for a maximum: all-negative columns would yield 4.9e-324.
+        double best = wantMax ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
+        for (ColumnStatisticsData statData : stats) {
+          double cur = wantMax ? statData.getDoubleStats().getHighValue()
+              : statData.getDoubleStats().getLowValue();
+          best = wantMax ? Math.max(best, cur) : Math.min(best, cur);
+        }
+        oneRow.add(best);
+        ois.add(PrimitiveObjectInspectorFactory.
+            getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+      }
+      return true;
+    }
+  }
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java Thu Oct 31 21:22:02 2013
@@ -337,7 +337,7 @@ public class PartitionPruner implements
private static Set<Partition> getAllPartitions(Table tab) throws HiveException {
PerfLogger perfLogger = PerfLogger.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
- Set<Partition> result = Hive.get().getAllPartitionsForPruner(tab);
+ Set<Partition> result = Hive.get().getAllPartitionsOf(tab);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
return result;
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java Thu Oct 31 21:22:02 2013
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.ListSinkOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.parse.SplitSample;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
/**
* FetchWork.
@@ -50,6 +51,9 @@ public class FetchWork implements Serial
private SplitSample splitSample;
+ private transient List<List<Object>> rowsComputedFromStats;
+ private transient ObjectInspector statRowOI;
+
/**
* Serialization Null Format for the serde used to fetch data.
*/
@@ -58,6 +62,19 @@ public class FetchWork implements Serial
public FetchWork() {
}
+ /**
+ * Creates a fetch work that carries already computed result rows instead of a location
+ * to read from.
+ *
+ * @param rowsComputedFromStats the result rows, one list of column values per row
+ * @param statRowOI object inspector describing the rows
+ */
+ public FetchWork(List<List<Object>> rowsComputedFromStats,ObjectInspector statRowOI) {
+ this.rowsComputedFromStats = rowsComputedFromStats;
+ this.statRowOI = statRowOI;
+ }
+
+ /**
+ * @return the object inspector passed to the stats-rows constructor, or null if this
+ * work was created through another constructor
+ */
+ public ObjectInspector getStatRowOI() {
+ return statRowOI;
+ }
+
+ /**
+ * @return the rows passed to the stats-rows constructor, or null if this work was
+ * created through another constructor
+ */
+ public List<List<Object>> getRowsComputedUsingStats() {
+ return rowsComputedFromStats;
+ }
+
public FetchWork(String tblDir, TableDesc tblDesc) {
this(tblDir, tblDesc, -1);
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java Thu Oct 31 21:22:02 2013
@@ -33,8 +33,6 @@ import java.util.Map;
import java.util.Set;
import java.util.UUID;
-import javax.security.auth.login.LoginException;
-
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
@@ -281,7 +279,7 @@ public class SessionState {
// Get the following out of the way when you start the session these take a
// while and should be done when we start up.
try {
- Hive.get(startSs.conf).setupMSC();
+ Hive.get(startSs.conf).getMSC();
ShimLoader.getHadoopShims().getUGIForConf(startSs.conf);
FileSystem.get(startSs.conf);
} catch (Exception e) {
@@ -289,7 +287,7 @@ public class SessionState {
// that would cause ClassNoFoundException otherwise
throw new RuntimeException(e);
}
-
+
try {
startSs.authenticator = HiveUtils.getAuthenticator(
startSs.getConf(),HiveConf.ConfVars.HIVE_AUTHENTICATOR_MANAGER);
Added: hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q?rev=1537667&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q Thu Oct 31 21:22:02 2013
@@ -0,0 +1,75 @@
+set hive.compute.query.using.stats=true;
+set hive.stats.autogather=true;
+create table over10k(
+ t tinyint,
+ si smallint,
+ i int,
+ b bigint,
+ f float,
+ d double,
+ bo boolean,
+ s string,
+ ts timestamp,
+ dec decimal,
+ bin binary)
+ row format delimited
+ fields terminated by '|';
+
+load data local inpath '../data/files/over10k' into table over10k;
+
+create table stats_tbl(
+ t tinyint,
+ si smallint,
+ i int,
+ b bigint,
+ f float,
+ d double,
+ bo boolean,
+ s string,
+ ts timestamp,
+ dec decimal,
+ bin binary);
+
+create table stats_tbl_part(
+ t tinyint,
+ si smallint,
+ i int,
+ b bigint,
+ f float,
+ d double,
+ bo boolean,
+ s string,
+ ts timestamp,
+ dec decimal,
+ bin binary) partitioned by (dt string);
+
+
+insert overwrite table stats_tbl select * from over10k;
+
+insert into table stats_tbl_part partition (dt='2010') select * from over10k where t>0 and t<30;
+insert into table stats_tbl_part partition (dt='2011') select * from over10k where t>30 and t<60;
+insert into table stats_tbl_part partition (dt='2012') select * from over10k where t>60;
+
+explain
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b) from stats_tbl;
+explain
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b) from stats_tbl_part;
+
+analyze table stats_tbl compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+analyze table stats_tbl_part partition(dt='2010') compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+analyze table stats_tbl_part partition(dt='2011') compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+analyze table stats_tbl_part partition(dt='2012') compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+
+explain
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl;
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl;
+explain
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl_part;
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl_part;
+
+explain select count(ts) from stats_tbl_part;
+
+drop table stats_tbl;
+drop table stats_tbl_part;
+
+set hive.compute.query.using.stats=false;
Added: hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q?rev=1537667&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q Thu Oct 31 21:22:02 2013
@@ -0,0 +1,36 @@
+set hive.compute.query.using.stats=true;
+set hive.stats.autogather=true;
+CREATE TABLE temps_null(a double, b int, c STRING, d smallint) STORED AS TEXTFILE;
+
+CREATE TABLE stats_null(a double, b int, c STRING, d smallint) STORED AS TEXTFILE;
+
+CREATE TABLE stats_null_part(a double, b int, c STRING, d smallint) partitioned by (dt string) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/null.txt' INTO TABLE temps_null;
+
+insert overwrite table stats_null select * from temps_null;
+insert overwrite table stats_null_part partition(dt='2010') select * from temps_null where d <=5;
+
+insert overwrite table stats_null_part partition(dt='2011') select * from temps_null where d > 5;
+explain
+select count(*), count(a), count(b), count(c), count(d) from stats_null;
+explain
+select count(*), count(a), count(b), count(c), count(d) from stats_null_part;
+
+
+analyze table stats_null compute statistics for columns a,b,c,d;
+analyze table stats_null_part partition(dt='2010') compute statistics for columns a,b,c,d;
+analyze table stats_null_part partition(dt='2011') compute statistics for columns a,b,c,d;
+
+explain
+select count(*), count(a), count(b), count(c), count(d) from stats_null;
+explain
+select count(*), count(a), count(b), count(c), count(d) from stats_null_part;
+
+
+select count(*), count(a), count(b), count(c), count(d) from stats_null;
+select count(*), count(a), count(b), count(c), count(d) from stats_null_part;
+drop table stats_null;
+drop table stats_null_part;
+drop table temps_null;
+set hive.compute.query.using.stats=false;