You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2014/08/22 17:30:33 UTC
svn commit: r1619839 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
test/results/clientpositive/annotate_stats_join.q.out
Author: hashutosh
Date: Fri Aug 22 15:30:33 2014
New Revision: 1619839
URL: http://svn.apache.org/r1619839
Log:
HIVE-7836 : Ease-out denominator for multi-attribute join case in statistics annotation (Prasanth J via Ashutosh Chauhan)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1619839&r1=1619838&r2=1619839&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Fri Aug 22 15:30:33 2014
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.optimi
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
@@ -811,6 +812,7 @@ public class StatsRulesProcFactory {
// 2 relations, multiple attributes
boolean multiAttr = false;
int numAttr = 1;
+ int numParent = parents.size();
Map<String, ColStatistics> joinedColStats = Maps.newHashMap();
Map<Integer, List<String>> joinKeys = Maps.newHashMap();
@@ -873,12 +875,19 @@ public class StatsRulesProcFactory {
perAttrDVs.add(cs.getCountDistint());
}
}
+
distinctVals.add(getDenominator(perAttrDVs));
perAttrDVs.clear();
}
- for (Long l : distinctVals) {
- denom *= l;
+ if (numAttr > numParent) {
+ // To keep the denominator from growing too large and aggressively reducing
+ // the number of rows, we ease out the denominator.
+ denom = getEasedOutDenominator(distinctVals);
+ } else {
+ for (Long l : distinctVals) {
+ denom *= l;
+ }
}
} else {
for (List<String> jkeys : joinKeys.values()) {
@@ -983,6 +992,20 @@ public class StatsRulesProcFactory {
return null;
}
+ private Long getEasedOutDenominator(List<Long> distinctVals) {
+ // Exponential back-off for NDVs (numbers of distinct values).
+ // 1) Sort the NDVs in descending order (this sorts the caller's list in place)
+ // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4)) * ...
+ Collections.sort(distinctVals, Collections.reverseOrder());
+
+ long denom = distinctVals.get(0); // largest NDV is taken at full weight; the list must be non-empty
+ for (int i = 1; i < distinctVals.size(); i++) {
+ denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i))); // exponent halves each step (1/2, 1/4, ...); the product is truncated to long every iteration
+ }
+
+ return denom;
+ }
+
private void updateStatsForJoinType(Statistics stats, long newNumRows,
JoinDesc conf, Map<String, Long> rowCountParents,
Map<String, String> outInTabAlias) {
Modified: hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out?rev=1619839&r1=1619838&r2=1619839&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out Fri Aug 22 15:30:33 2014
@@ -391,17 +391,17 @@ STAGE PLANS:
0 {KEY.reducesinkkey1} {KEY.reducesinkkey0} {VALUE._col0}
1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
outputColumnNames: _col0, _col1, _col2, _col6, _col7
- Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 11 Data size: 2134 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (((_col1 = _col6) and (_col0 = _col7)) and (_col7 = _col0)) (type: boolean)
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col6 (type: int), _col7 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat