You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rh...@apache.org on 2014/09/06 01:45:06 UTC
svn commit: r1622819 - in
/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql: optimizer/optiq/
optimizer/optiq/stats/ parse/
Author: rhbutani
Date: Fri Sep 5 23:45:05 2014
New Revision: 1622819
URL: http://svn.apache.org/r1622819
Log:
HIVE-7915 CBO: more cost model changes (Harish Butani via John Pullokkaran)
Added:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdRowCount.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdUniqueKeys.java
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java?rev=1622819&r1=1622818&r2=1622819&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java Fri Sep 5 23:45:05 2014
@@ -3,7 +3,9 @@ package org.apache.hadoop.hive.ql.optimi
import com.google.common.collect.ImmutableList;
import org.apache.hadoop.hive.ql.optimizer.optiq.stats.HiveRelMdDistinctRowCount;
+import org.apache.hadoop.hive.ql.optimizer.optiq.stats.HiveRelMdRowCount;
import org.apache.hadoop.hive.ql.optimizer.optiq.stats.HiveRelMdSelectivity;
+import org.apache.hadoop.hive.ql.optimizer.optiq.stats.HiveRelMdUniqueKeys;
import org.eigenbase.rel.metadata.ChainedRelMetadataProvider;
import org.eigenbase.rel.metadata.DefaultRelMetadataProvider;
import org.eigenbase.rel.metadata.RelMetadataProvider;
@@ -23,5 +25,7 @@ public class HiveDefaultRelMetadataProvi
public static final RelMetadataProvider INSTANCE = ChainedRelMetadataProvider.of(ImmutableList
.of(HiveRelMdDistinctRowCount.SOURCE,
HiveRelMdSelectivity.SOURCE,
+ HiveRelMdRowCount.SOURCE,
+ HiveRelMdUniqueKeys.SOURCE,
new DefaultRelMetadataProvider()));
}
Added: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdRowCount.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdRowCount.java?rev=1622819&view=auto
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdRowCount.java (added)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdRowCount.java Fri Sep 5 23:45:05 2014
@@ -0,0 +1,420 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.optiq.stats;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Set;
+
+import net.hydromatic.optiq.BuiltinMethod;
+import net.hydromatic.optiq.util.BitSets;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.eigenbase.rel.FilterRelBase;
+import org.eigenbase.rel.JoinRelBase;
+import org.eigenbase.rel.JoinRelType;
+import org.eigenbase.rel.ProjectRelBase;
+import org.eigenbase.rel.RelNode;
+import org.eigenbase.rel.RelVisitor;
+import org.eigenbase.rel.TableAccessRelBase;
+import org.eigenbase.rel.metadata.ReflectiveRelMetadataProvider;
+import org.eigenbase.rel.metadata.RelMdRowCount;
+import org.eigenbase.rel.metadata.RelMetadataProvider;
+import org.eigenbase.rel.metadata.RelMetadataQuery;
+import org.eigenbase.rel.rules.SemiJoinRel;
+import org.eigenbase.relopt.RelOptUtil;
+import org.eigenbase.rex.RexBuilder;
+import org.eigenbase.rex.RexCall;
+import org.eigenbase.rex.RexInputRef;
+import org.eigenbase.rex.RexNode;
+import org.eigenbase.rex.RexUtil;
+import org.eigenbase.sql.fun.SqlStdOperatorTable;
+import org.eigenbase.util.Holder;
+import org.eigenbase.util.Pair;
+
+public class HiveRelMdRowCount extends RelMdRowCount {
+
+ protected static final Log LOG = LogFactory.getLog(HiveRelMdRowCount.class.getName());
+
+
+ public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider
+ .reflectiveSource(BuiltinMethod.ROW_COUNT.method, new HiveRelMdRowCount());
+
+ protected HiveRelMdRowCount() {
+ super();
+ }
+
+ public Double getRowCount(JoinRelBase join) {
+ PKFKRelationInfo pkfk = analyzeJoinForPKFK(join);
+ if (pkfk != null) {
+ double selectivity = (pkfk.pkInfo.selectivity * pkfk.ndvScalingFactor);
+ selectivity = Math.min(1.0, selectivity);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Identified Primary - Foreign Key relation:");
+ LOG.debug(RelOptUtil.toString(join));
+ LOG.debug(pkfk);
+ }
+ return pkfk.fkInfo.rowCount * selectivity;
+ }
+ return join.getRows();
+ }
+
+ public Double getRowCount(SemiJoinRel rel) {
+ PKFKRelationInfo pkfk = analyzeJoinForPKFK(rel);
+ if (pkfk != null) {
+ double selectivity = (pkfk.pkInfo.selectivity * pkfk.ndvScalingFactor);
+ selectivity = Math.min(1.0, selectivity);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Identified Primary - Foreign Key relation:");
+ LOG.debug(RelOptUtil.toString(rel));
+ LOG.debug(pkfk);
+ }
+ return pkfk.fkInfo.rowCount * selectivity;
+ }
+ return super.getRowCount(rel);
+ }
+
+ static class PKFKRelationInfo {
+ public final int fkSide;
+ public final double ndvScalingFactor;
+ public final FKSideInfo fkInfo;
+ public final PKSideInfo pkInfo;
+ public final boolean isPKSideSimple;
+
+ PKFKRelationInfo(int fkSide,
+ FKSideInfo fkInfo,
+ PKSideInfo pkInfo,
+ double ndvScalingFactor,
+ boolean isPKSideSimple) {
+ this.fkSide = fkSide;
+ this.fkInfo = fkInfo;
+ this.pkInfo = pkInfo;
+ this.ndvScalingFactor = ndvScalingFactor;
+ this.isPKSideSimple = isPKSideSimple;
+ }
+
+ public String toString() {
+ return String.format(
+ "Primary - Foreign Key join:\n\tfkSide = %d\n\tFKInfo:%s\n" +
+ "\tPKInfo:%s\n\tisPKSideSimple:%s\n\tNDV Scaling Factor:%.2f\n",
+ fkSide,
+ fkInfo,
+ pkInfo,
+ isPKSideSimple,
+ ndvScalingFactor);
+ }
+ }
+
+ static class FKSideInfo {
+ public final double rowCount;
+ public final double distinctCount;
+ public FKSideInfo(double rowCount, double distinctCount) {
+ this.rowCount = rowCount;
+ this.distinctCount = distinctCount;
+ }
+
+ public String toString() {
+ return String.format("FKInfo(rowCount=%.2f,ndv=%.2f)", rowCount, distinctCount);
+ }
+ }
+
+ static class PKSideInfo extends FKSideInfo {
+ public final double selectivity;
+ public PKSideInfo(double rowCount, double distinctCount, double selectivity) {
+ super(rowCount, distinctCount);
+ this.selectivity = selectivity;
+ }
+
+ public String toString() {
+ return String.format("PKInfo(rowCount=%.2f,ndv=%.2f,selectivity=%.2f)", rowCount, distinctCount,selectivity);
+ }
+ }
+
+ /*
+ * For T1 join T2 on T1.x = T2.y if we identify 'y' s a key of T2 then we can
+ * infer the join cardinality as: rowCount(T1) * selectivity(T2) i.e this is
+ * like a SemiJoin where the T1(Fact side/FK side) is filtered by a factor
+ * based on the Selectivity of the PK/Dim table side.
+ *
+ * 1. If both T1.x and T2.y are keys then use the larger one as the PK side.
+ * 2. In case of outer Joins: a) The FK side should be the Null Preserving
+ * side. It doesn't make sense to apply this heuristic in case of Dim loj Fact
+ * or Fact roj Dim b) The selectivity factor applied on the Fact Table should
+ * be 1.
+ */
+ public static PKFKRelationInfo analyzeJoinForPKFK(JoinRelBase joinRel) {
+
+ RelNode left = joinRel.getInputs().get(0);
+ RelNode right = joinRel.getInputs().get(1);
+
+ final List<RexNode> initJoinFilters = RelOptUtil.conjunctions(joinRel
+ .getCondition());
+
+ /*
+ * No joining condition.
+ */
+ if (initJoinFilters.isEmpty()) {
+ return null;
+ }
+
+ List<RexNode> leftFilters = new ArrayList<RexNode>();
+ List<RexNode> rightFilters = new ArrayList<RexNode>();
+ List<RexNode> joinFilters = new ArrayList<RexNode>(initJoinFilters);
+ final Holder<JoinRelType> joinTypeHolder = Holder.of(joinRel.getJoinType());
+
+ // @todo: remove this. 8/28/14 hb
+ // for now adding because RelOptUtil.classifyFilters has an assertion about
+ // column counts that is not true for semiJoins.
+ if (joinRel instanceof SemiJoinRel) {
+ return null;
+ }
+
+ RelOptUtil.classifyFilters(joinRel, joinFilters, joinRel.getJoinType(),
+ false, !joinRel.getJoinType().generatesNullsOnRight(), !joinRel
+ .getJoinType().generatesNullsOnLeft(), joinFilters, leftFilters,
+ rightFilters, joinTypeHolder, false);
+
+ Pair<Integer, Integer> joinCols = canHandleJoin(joinRel, leftFilters,
+ rightFilters, joinFilters);
+ if (joinCols == null) {
+ return null;
+ }
+ int leftColIdx = joinCols.left;
+ int rightColIdx = joinCols.right;
+
+ RexBuilder rexBuilder = joinRel.getCluster().getRexBuilder();
+ RexNode leftPred = RexUtil
+ .composeConjunction(rexBuilder, leftFilters, true);
+ RexNode rightPred = RexUtil.composeConjunction(rexBuilder, rightFilters,
+ true);
+ BitSet lBitSet = BitSets.of(leftColIdx);
+ BitSet rBitSet = BitSets.of(rightColIdx);
+
+ /*
+ * If the form is Dim loj F or Fact roj Dim or Dim semij Fact then return
+ * null.
+ */
+ boolean leftIsKey = (joinRel.getJoinType() == JoinRelType.INNER || joinRel
+ .getJoinType() == JoinRelType.RIGHT)
+ && !(joinRel instanceof SemiJoinRel) && isKey(lBitSet, left);
+ boolean rightIsKey = (joinRel.getJoinType() == JoinRelType.INNER || joinRel
+ .getJoinType() == JoinRelType.LEFT) && isKey(rBitSet, right);
+
+ if (!leftIsKey && !rightIsKey) {
+ return null;
+ }
+
+ double leftRowCount = RelMetadataQuery.getRowCount(left)
+ * RelMetadataQuery.getSelectivity(left, leftPred);
+ double rightRowCount = RelMetadataQuery.getRowCount(right)
+ * RelMetadataQuery.getSelectivity(right, rightPred);
+
+ if (leftIsKey && rightIsKey) {
+ if (rightRowCount < leftRowCount) {
+ leftIsKey = false;
+ }
+ }
+
+ int pkSide = leftIsKey ? 0 : rightIsKey ? 1 : -1;
+
+ boolean isPKSideSimpleTree = pkSide != -1 ?
+ IsSimpleTreeOnJoinKey.check(
+ pkSide == 0 ? left : right,
+ pkSide == 0 ? leftColIdx : rightColIdx) : false;
+
+ double leftNDV = isPKSideSimpleTree ? RelMetadataQuery.getDistinctRowCount(left, lBitSet, leftPred) : -1;
+ double rightNDV = isPKSideSimpleTree ? RelMetadataQuery.getDistinctRowCount(right, rBitSet, rightPred) : -1;
+
+ /*
+ * If the ndv of the PK - FK side don't match, and the PK side is a filter
+ * on the Key column then scale the NDV on the FK side.
+ *
+ * As described by Peter Boncz: http://databasearchitects.blogspot.com/
+ * in such cases we can be off by a large margin in the Join cardinality
+ * estimate. The e.g. he provides is on the join of StoreSales and DateDim
+ * on the TPCDS dataset. Since the DateDim is populated for 20 years into
+ * the future, while the StoreSales only has 5 years worth of data, there
+ * are 40 times fewer distinct dates in StoreSales.
+ *
+ * In general it is hard to infer the range for the foreign key on an
+ * arbitrary expression. For e.g. the NDV for DayofWeek is the same
+ * irrespective of NDV on the number of unique days, whereas the
+ * NDV of Quarters has the same ratio as the NDV on the keys.
+ *
+ * But for expressions that apply only on columns that have the same NDV
+ * as the key (implying that they are alternate keys) we can apply the
+ * ratio. So in the case of StoreSales - DateDim joins for predicate on the
+ * d_date column we can apply the scaling factor.
+ */
+ double ndvScalingFactor = 1.0;
+ if ( isPKSideSimpleTree ) {
+ ndvScalingFactor = pkSide == 0 ? leftNDV/rightNDV : rightNDV / leftNDV;
+ }
+
+ if (pkSide == 0) {
+ FKSideInfo fkInfo = new FKSideInfo(rightRowCount,
+ rightNDV);
+ PKSideInfo pkInfo = new PKSideInfo(leftRowCount,
+ leftNDV,
+ joinRel.getJoinType().generatesNullsOnRight() ? 1.0 :
+ RelMetadataQuery.getSelectivity(left, leftPred));
+
+ return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree);
+ }
+
+ if (pkSide == 1) {
+ FKSideInfo fkInfo = new FKSideInfo(leftRowCount,
+ leftNDV);
+ PKSideInfo pkInfo = new PKSideInfo(rightRowCount,
+ rightNDV,
+ joinRel.getJoinType().generatesNullsOnLeft() ? 1.0 :
+ RelMetadataQuery.getSelectivity(right, rightPred));
+
+ return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree);
+ }
+
+ return null;
+ }
+
+ private static boolean isKey(BitSet c, RelNode rel) {
+ boolean isKey = false;
+ Set<BitSet> keys = RelMetadataQuery.getUniqueKeys(rel);
+ if (keys != null) {
+ for (BitSet key : keys) {
+ if (key.equals(c)) {
+ isKey = true;
+ break;
+ }
+ }
+ }
+ return isKey;
+ }
+
+ /*
+ * 1. Join condition must be an Equality Predicate.
+ * 2. both sides must reference 1 column.
+ * 3. If needed flip the columns.
+ */
+ private static Pair<Integer, Integer> canHandleJoin(JoinRelBase joinRel,
+ List<RexNode> leftFilters, List<RexNode> rightFilters,
+ List<RexNode> joinFilters) {
+
+ /*
+ * If after classifying filters there is more than 1 joining predicate, we
+ * don't handle this. Return null.
+ */
+ if (joinFilters.size() != 1) {
+ return null;
+ }
+
+ RexNode joinCond = joinFilters.get(0);
+
+ int leftColIdx;
+ int rightColIdx;
+
+ if (!(joinCond instanceof RexCall)) {
+ return null;
+ }
+
+ if (((RexCall) joinCond).getOperator() != SqlStdOperatorTable.EQUALS) {
+ return null;
+ }
+
+ BitSet leftCols = RelOptUtil.InputFinder.bits(((RexCall) joinCond).getOperands().get(0));
+ BitSet rightCols = RelOptUtil.InputFinder.bits(((RexCall) joinCond).getOperands().get(1));
+
+ if (leftCols.cardinality() != 1 || rightCols.cardinality() != 1 ) {
+ return null;
+ }
+
+ int nFieldsLeft = joinRel.getLeft().getRowType().getFieldList().size();
+ int nFieldsRight = joinRel.getRight().getRowType().getFieldList().size();
+ int nSysFields = joinRel.getSystemFieldList().size();
+ BitSet rightFieldsBitSet = BitSets.range(nSysFields + nFieldsLeft,
+ nSysFields + nFieldsLeft + nFieldsRight);
+ /*
+ * flip column references if join condition specified in reverse order to
+ * join sources.
+ */
+ if (BitSets.contains(rightFieldsBitSet, leftCols)) {
+ BitSet t = leftCols;
+ leftCols = rightCols;
+ rightCols = t;
+ }
+
+ leftColIdx = leftCols.nextSetBit(0) - nSysFields;
+ rightColIdx = rightCols.nextSetBit(0) - (nSysFields + nFieldsLeft);
+
+ return new Pair<Integer, Integer>(leftColIdx, rightColIdx);
+ }
+
+ private static class IsSimpleTreeOnJoinKey extends RelVisitor {
+
+ int joinKey;
+ boolean simpleTree;
+
+ static boolean check(RelNode r, int joinKey) {
+ IsSimpleTreeOnJoinKey v = new IsSimpleTreeOnJoinKey(joinKey);
+ v.go(r);
+ return v.simpleTree;
+ }
+
+ IsSimpleTreeOnJoinKey(int joinKey) {
+ super();
+ this.joinKey = joinKey;
+ simpleTree = true;
+ }
+
+ @Override
+ public void visit(RelNode node, int ordinal, RelNode parent) {
+
+ if (node instanceof TableAccessRelBase) {
+ simpleTree = true;
+ } else if (node instanceof ProjectRelBase) {
+ simpleTree = isSimple((ProjectRelBase) node);
+ } else if (node instanceof FilterRelBase) {
+ simpleTree = isSimple((FilterRelBase) node);
+ } else {
+ simpleTree = false;
+ }
+
+ if (simpleTree) {
+ super.visit(node, ordinal, parent);
+ }
+ }
+
+ private boolean isSimple(ProjectRelBase project) {
+ RexNode r = project.getProjects().get(joinKey);
+ if (r instanceof RexInputRef) {
+ joinKey = ((RexInputRef) r).getIndex();
+ return true;
+ }
+ return false;
+ }
+
+ private boolean isSimple(FilterRelBase filter) {
+ BitSet condBits = RelOptUtil.InputFinder.bits(filter.getCondition());
+ return isKey(condBits, filter);
+ }
+
+ }
+
+}
Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java?rev=1622819&r1=1622818&r2=1622819&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java Fri Sep 5 23:45:05 2014
@@ -1,6 +1,7 @@
package org.apache.hadoop.hive.ql.optimizer.optiq.stats;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -77,59 +78,84 @@ public class HiveRelMdSelectivity extend
List<JoinLeafPredicateInfo> peLst = jpi.getEquiJoinPredicateElements();
int noOfPE = peLst.size();
if (noOfPE > 0) {
- // 3.1 Use first conjunctive predicate element's NDV as the seed
- ndvCrossProduct = getMaxNDVForJoinSelectivity(peLst.get(0), colStatMap);
+ ndvCrossProduct = exponentialBackoff(peLst, colStatMap);
- // 3.2 if conjunctive predicate elements are more than one, then walk
- // through them one by one. Compute cross product of NDV. Cross product is
- // computed by multiplying the largest NDV of all of the conjunctive
- // predicate
- // elements with degraded NDV of rest of the conjunctive predicate
- // elements. NDV is
- // degraded using log function.Finally the ndvCrossProduct is fenced at
- // the join
- // cross product to ensure that NDV can not exceed worst case join
- // cardinality.<br>
- // NDV of a conjunctive predicate element is the max NDV of all arguments
- // to lhs, rhs expressions.
- // NDV(JoinCondition) = min (left cardinality * right cardinality,
- // ndvCrossProduct(JoinCondition))
- // ndvCrossProduct(JoinCondition) = ndv(pex)*log(ndv(pe1))*log(ndv(pe2))
- // where pex is the predicate element of join condition with max ndv.
- // ndv(pe) = max(NDV(left.Expr), NDV(right.Expr))
- // NDV(expr) = max(NDV( expr args))
- if (noOfPE > 1) {
- double maxNDVSoFar = ndvCrossProduct;
- double ndvToBeSmoothed;
- double tmpNDV;
-
- for (int i = 1; i < noOfPE; i++) {
- tmpNDV = getMaxNDVForJoinSelectivity(peLst.get(i), colStatMap);
- if (tmpNDV > maxNDVSoFar) {
- ndvToBeSmoothed = maxNDVSoFar;
- maxNDVSoFar = tmpNDV;
- ndvCrossProduct = (ndvCrossProduct / ndvToBeSmoothed) * tmpNDV;
- } else {
- ndvToBeSmoothed = tmpNDV;
- }
- // TODO: revisit the fence
- if (ndvToBeSmoothed > 3)
- ndvCrossProduct *= Math.log(ndvToBeSmoothed);
- else
- ndvCrossProduct *= ndvToBeSmoothed;
- }
+ if (j.isLeftSemiJoin())
+ ndvCrossProduct = Math.min(RelMetadataQuery.getRowCount(j.getLeft()),
+ ndvCrossProduct);
+ else
+ ndvCrossProduct = Math.min(RelMetadataQuery.getRowCount(j.getLeft())
+ * RelMetadataQuery.getRowCount(j.getRight()), ndvCrossProduct);
+ }
+
+ // 4. Join Selectivity = 1/NDV
+ return (1 / ndvCrossProduct);
+ }
- if (j.isLeftSemiJoin())
- ndvCrossProduct = Math.min(RelMetadataQuery.getRowCount(j.getLeft()),
- ndvCrossProduct);
+ // 3.2 if conjunctive predicate elements are more than one, then walk
+ // through them one by one. Compute cross product of NDV. Cross product is
+ // computed by multiplying the largest NDV of all of the conjunctive
+ // predicate
+ // elements with degraded NDV of rest of the conjunctive predicate
+ // elements. NDV is
+ // degraded using log function.Finally the ndvCrossProduct is fenced at
+ // the join
+ // cross product to ensure that NDV can not exceed worst case join
+ // cardinality.<br>
+ // NDV of a conjunctive predicate element is the max NDV of all arguments
+ // to lhs, rhs expressions.
+ // NDV(JoinCondition) = min (left cardinality * right cardinality,
+ // ndvCrossProduct(JoinCondition))
+ // ndvCrossProduct(JoinCondition) = ndv(pex)*log(ndv(pe1))*log(ndv(pe2))
+ // where pex is the predicate element of join condition with max ndv.
+ // ndv(pe) = max(NDV(left.Expr), NDV(right.Expr))
+ // NDV(expr) = max(NDV( expr args))
+ protected double logSmoothing(List<JoinLeafPredicateInfo> peLst, ImmutableMap<Integer, Double> colStatMap) {
+ int noOfPE = peLst.size();
+ double ndvCrossProduct = getMaxNDVForJoinSelectivity(peLst.get(0), colStatMap);
+ if (noOfPE > 1) {
+ double maxNDVSoFar = ndvCrossProduct;
+ double ndvToBeSmoothed;
+ double tmpNDV;
+
+ for (int i = 1; i < noOfPE; i++) {
+ tmpNDV = getMaxNDVForJoinSelectivity(peLst.get(i), colStatMap);
+ if (tmpNDV > maxNDVSoFar) {
+ ndvToBeSmoothed = maxNDVSoFar;
+ maxNDVSoFar = tmpNDV;
+ ndvCrossProduct = (ndvCrossProduct / ndvToBeSmoothed) * tmpNDV;
+ } else {
+ ndvToBeSmoothed = tmpNDV;
+ }
+ // TODO: revisit the fence
+ if (ndvToBeSmoothed > 3)
+ ndvCrossProduct *= Math.log(ndvToBeSmoothed);
else
- ndvCrossProduct = Math.min(RelMetadataQuery.getRowCount(j.getLeft())
- * RelMetadataQuery.getRowCount(j.getRight()), ndvCrossProduct);
+ ndvCrossProduct *= ndvToBeSmoothed;
}
}
+ return ndvCrossProduct;
+ }
- // 4. Join Selectivity = 1/NDV
- return (1 / ndvCrossProduct);
+ /*
+ * a) Order predciates based on ndv in reverse order. b) ndvCrossProduct =
+ * ndv(pe0) * ndv(pe1) ^(1/2) * ndv(pe2) ^(1/4) * ndv(pe3) ^(1/8) ...
+ */
+ protected double exponentialBackoff(List<JoinLeafPredicateInfo> peLst,
+ ImmutableMap<Integer, Double> colStatMap) {
+ int noOfPE = peLst.size();
+ List<Double> ndvs = new ArrayList<Double>(noOfPE);
+ for (int i = 0; i < noOfPE; i++) {
+ ndvs.add(getMaxNDVForJoinSelectivity(peLst.get(i), colStatMap));
+ }
+ Collections.sort(ndvs);
+ Collections.reverse(ndvs);
+ double ndvCrossProduct = 1.0;
+ for (int i = 0; i < ndvs.size(); i++) {
+ double n = Math.pow(ndvs.get(i), Math.pow(1 / 2.0, i));
+ ndvCrossProduct *= n;
+ }
+ return ndvCrossProduct;
}
private RexNode getCombinedPredicateForJoin(HiveJoinRel j, RexNode additionalPredicate) {
@@ -181,4 +207,5 @@ public class HiveRelMdSelectivity extend
return maxNDVSoFar;
}
+
}
Added: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdUniqueKeys.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdUniqueKeys.java?rev=1622819&view=auto
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdUniqueKeys.java (added)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdUniqueKeys.java Fri Sep 5 23:45:05 2014
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.optiq.stats;
+
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import net.hydromatic.optiq.BuiltinMethod;
+import net.hydromatic.optiq.util.BitSets;
+
+import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveTableScanRel;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.eigenbase.rel.ProjectRelBase;
+import org.eigenbase.rel.RelNode;
+import org.eigenbase.rel.metadata.BuiltInMetadata;
+import org.eigenbase.rel.metadata.Metadata;
+import org.eigenbase.rel.metadata.ReflectiveRelMetadataProvider;
+import org.eigenbase.rel.metadata.RelMdUniqueKeys;
+import org.eigenbase.rel.metadata.RelMetadataProvider;
+import org.eigenbase.rex.RexInputRef;
+import org.eigenbase.rex.RexNode;
+
+import com.google.common.base.Function;
+
+public class HiveRelMdUniqueKeys {
+
+ public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider
+ .reflectiveSource(BuiltinMethod.UNIQUE_KEYS.method,
+ new HiveRelMdUniqueKeys());
+
+ /*
+ * Infer Uniquenes if: - rowCount(col) = ndv(col) - TBD for numerics: max(col)
+ * - min(col) = rowCount(col)
+ *
+ * Why are we intercepting ProjectRelbase and not TableScan? Because if we
+ * have a method for TableScan, it will not know which columns to check for.
+ * Inferring Uniqueness for all columns is very expensive right now. The flip
+ * side of doing this is, it only works post Field Trimming.
+ */
+ public Set<BitSet> getUniqueKeys(ProjectRelBase rel, boolean ignoreNulls) {
+
+ RelNode child = rel.getChild();
+
+ if (!(child instanceof HiveTableScanRel)) {
+ Function<RelNode, Metadata> fn = RelMdUniqueKeys.SOURCE.apply(
+ rel.getClass(), BuiltInMetadata.UniqueKeys.class);
+ return ((BuiltInMetadata.UniqueKeys) fn.apply(rel))
+ .getUniqueKeys(ignoreNulls);
+ }
+
+ HiveTableScanRel tScan = (HiveTableScanRel) child;
+ Map<Integer, Integer> posMap = new HashMap<Integer, Integer>();
+ int projectPos = 0;
+ int colStatsPos = 0;
+
+ BitSet projectedCols = new BitSet();
+ for (RexNode r : rel.getProjects()) {
+ if (r instanceof RexInputRef) {
+ projectedCols.set(((RexInputRef) r).getIndex());
+ posMap.put(colStatsPos, projectPos);
+ colStatsPos++;
+ }
+ projectPos++;
+ }
+
+ double numRows = tScan.getRows();
+ List<ColStatistics> colStats = tScan.getColStat(BitSets
+ .toList(projectedCols));
+ Set<BitSet> keys = new HashSet<BitSet>();
+
+ colStatsPos = 0;
+ for (ColStatistics cStat : colStats) {
+ boolean isKey = false;
+ if (cStat.getCountDistint() >= numRows) {
+ isKey = true;
+ }
+ if ( !isKey && cStat.getRange() != null &&
+ cStat.getRange().maxValue != null &&
+ cStat.getRange().minValue != null) {
+ double r = cStat.getRange().maxValue.doubleValue() -
+ cStat.getRange().minValue.doubleValue() + 1;
+ isKey = (numRows == r);
+ }
+ if ( isKey ) {
+ BitSet key = new BitSet();
+ key.set(posMap.get(colStatsPos));
+ keys.add(key);
+ }
+ colStatsPos++;
+ }
+
+ return keys;
+ }
+
+}
Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1622819&r1=1622818&r2=1622819&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Fri Sep 5 23:45:05 2014
@@ -11946,9 +11946,9 @@ public class SemanticAnalyzer extends Ba
if (LOG.isDebugEnabled() && !conf.getBoolVar(ConfVars.HIVE_IN_TEST)) {
LOG.debug("CBO Planning details:\n");
LOG.debug("Original Plan:\n");
- LOG.debug(RelOptUtil.toString(optiqGenPlan, SqlExplainLevel.ALL_ATTRIBUTES));
+ LOG.debug(RelOptUtil.toString(optiqGenPlan));
LOG.debug("Plan After PPD, PartPruning, ColumnPruning:\n");
- LOG.debug(RelOptUtil.toString(optiqPreCboPlan, SqlExplainLevel.ALL_ATTRIBUTES));
+ LOG.debug(RelOptUtil.toString(optiqPreCboPlan));
LOG.debug("Plan After Join Reordering:\n");
LOG.debug(RelOptUtil.toString(optiqOptimizedPlan, SqlExplainLevel.ALL_ATTRIBUTES));
}