You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2014/08/19 19:18:03 UTC
svn commit: r1618904 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
test/results/clientpositive/union20.q.out
Author: prasanthj
Date: Tue Aug 19 17:18:03 2014
New Revision: 1618904
URL: http://svn.apache.org/r1618904
Log:
HIVE-7734: Join stats annotation rule is not updating columns statistics correctly (Prasanth J, reviewed by Gunther Hagleitner)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
hive/trunk/ql/src/test/results/clientpositive/union20.q.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1618904&r1=1618903&r2=1618904&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Tue Aug 19 17:18:03 2014
@@ -67,8 +67,10 @@ import org.apache.hadoop.hive.ql.udf.gen
import org.apache.hadoop.hive.serde.serdeConstants;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.Stack;
public class StatsRulesProcFactory {
@@ -803,7 +805,7 @@ public class StatsRulesProcFactory {
// statistics object that is combination of statistics from all
// relations involved in JOIN
Statistics stats = new Statistics();
- List<Long> rowCountParents = Lists.newArrayList();
+ Map<String, Long> rowCountParents = new HashMap<String, Long>();
List<Long> distinctVals = Lists.newArrayList();
// 2 relations, multiple attributes
@@ -818,9 +820,20 @@ public class StatsRulesProcFactory {
ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
Statistics parentStats = parent.getStatistics();
- rowCountParents.add(parentStats.getNumRows());
List<ExprNodeDesc> keyExprs = parent.getConf().getKeyCols();
+ // Parent RS may have column statistics from multiple parents.
+ // Populate table alias to row count map, this will be used later to
+ // scale down/up column statistics based on new row count
+ // NOTE: JOIN with UNION as parent of RS will not have table alias
+ // propagated properly. UNION operator does not propagate the table
+ // alias of subqueries properly to expression nodes. Hence union20.q
+ // will have wrong number of rows.
+ Set<String> tableAliases = StatsUtils.getAllTableAlias(parent.getColumnExprMap());
+ for (String tabAlias : tableAliases) {
+ rowCountParents.put(tabAlias, parentStats.getNumRows());
+ }
+
// multi-attribute join key
if (keyExprs.size() > 1) {
multiAttr = true;
@@ -890,6 +903,7 @@ public class StatsRulesProcFactory {
Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
RowSchema rs = jop.getSchema();
List<ColStatistics> outColStats = Lists.newArrayList();
+ Map<String, String> outInTabAlias = new HashMap<String, String>();
for (ColumnInfo ci : rs.getSignature()) {
String key = ci.getInternalName();
ExprNodeDesc end = colExprMap.get(key);
@@ -901,6 +915,7 @@ public class StatsRulesProcFactory {
ColStatistics cs = joinedColStats.get(fqColName);
String outColName = key;
String outTabAlias = ci.getTabAlias();
+ outInTabAlias.put(outTabAlias, tabAlias);
if (cs != null) {
cs.setColumnName(outColName);
cs.setTableAlias(outTabAlias);
@@ -911,7 +926,8 @@ public class StatsRulesProcFactory {
// update join statistics
stats.setColumnStats(outColStats);
- long newRowCount = computeNewRowCount(rowCountParents, denom);
+ long newRowCount = computeNewRowCount(
+ Lists.newArrayList(rowCountParents.values()), denom);
if (newRowCount <= 0 && LOG.isDebugEnabled()) {
newRowCount = 0;
@@ -920,7 +936,8 @@ public class StatsRulesProcFactory {
+ " #Rows of parents: " + rowCountParents.toString() + ". Denominator: " + denom);
}
- updateStatsForJoinType(stats, newRowCount, true, jop.getConf());
+ updateStatsForJoinType(stats, newRowCount, jop.getConf(),
+ rowCountParents, outInTabAlias);
jop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
@@ -967,36 +984,39 @@ public class StatsRulesProcFactory {
}
private void updateStatsForJoinType(Statistics stats, long newNumRows,
- boolean useColStats, JoinDesc conf) {
- long oldRowCount = stats.getNumRows();
- double ratio = (double) newNumRows / (double) oldRowCount;
+ JoinDesc conf, Map<String, Long> rowCountParents,
+ Map<String, String> outInTabAlias) {
stats.setNumRows(newNumRows);
- if (useColStats) {
- List<ColStatistics> colStats = stats.getColumnStats();
- for (ColStatistics cs : colStats) {
- long oldDV = cs.getCountDistint();
- long newDV = oldDV;
-
- // if ratio is greater than 1, then number of rows increases. This can happen
- // when some operators like GROUPBY duplicates the input rows in which case
- // number of distincts should not change. Update the distinct count only when
- // the output number of rows is less than input number of rows.
- if (ratio <= 1.0) {
- newDV = (long) Math.ceil(ratio * oldDV);
- }
- // Assumes inner join
- // TODO: HIVE-5579 will handle different join types
- cs.setNumNulls(0);
- cs.setCountDistint(newDV);
- }
- stats.setColumnStats(colStats);
- long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats);
- stats.setDataSize(newDataSize);
- } else {
- long newDataSize = (long) (ratio * stats.getDataSize());
- stats.setDataSize(newDataSize);
+ // scale down/up the column statistics based on the changes in number of
+ // rows from each parent. For ex: If there are 2 parents for JOIN operator
+ // with 1st parent having 200 rows and 2nd parent having 2000 rows. Now if
+ // the new number of rows after applying join rule is 10, then the column
+ // stats for columns from 1st parent should be scaled down by 200/10 = 20x
+ // and stats for columns from 2nd parent should be scaled down by 200x
+ List<ColStatistics> colStats = stats.getColumnStats();
+ for (ColStatistics cs : colStats) {
+ long oldRowCount = rowCountParents.get(outInTabAlias.get(cs.getTableAlias()));
+ double ratio = (double) newNumRows / (double) oldRowCount;
+ long oldDV = cs.getCountDistint();
+ long newDV = oldDV;
+
+ // if ratio is greater than 1, then number of rows increases. This can happen
+ // when some operators like GROUPBY duplicates the input rows in which case
+ // number of distincts should not change. Update the distinct count only when
+ // the output number of rows is less than input number of rows.
+ if (ratio <= 1.0) {
+ newDV = (long) Math.ceil(ratio * oldDV);
+ }
+ // Assumes inner join
+ // TODO: HIVE-5579 will handle different join types
+ cs.setNumNulls(0);
+ cs.setCountDistint(newDV);
}
+ stats.setColumnStats(colStats);
+ long newDataSize = StatsUtils
+ .getDataSizeFromColumnStats(newNumRows, colStats);
+ stats.setDataSize(newDataSize);
}
private long computeNewRowCount(List<Long> rowCountParents, long denom) {
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1618904&r1=1618903&r2=1618904&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Tue Aug 19 17:18:03 2014
@@ -18,11 +18,8 @@
package org.apache.hadoop.hive.ql.stats;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
@@ -79,8 +76,12 @@ import org.apache.hadoop.hive.serde2.obj
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
import org.apache.hadoop.io.BytesWritable;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
public class StatsUtils {
@@ -1216,4 +1217,33 @@ public class StatsUtils {
}
return result;
}
+
+ /**
+ * Returns all table aliases from expression nodes
+ * @param columnExprMap - column expression map
+ * @return
+ */
+ public static Set<String> getAllTableAlias(
+ Map<String, ExprNodeDesc> columnExprMap) {
+ Set<String> result = new HashSet<String>();
+ if (columnExprMap != null) {
+ for (ExprNodeDesc end : columnExprMap.values()) {
+ getTableAliasFromExprNode(end, result);
+ }
+ }
+ return result;
+ }
+
+ private static void getTableAliasFromExprNode(ExprNodeDesc end,
+ Set<String> output) {
+
+ if (end instanceof ExprNodeColumnDesc) {
+ output.add(((ExprNodeColumnDesc) end).getTabAlias());
+ } else if (end instanceof ExprNodeGenericFuncDesc) {
+ for (ExprNodeDesc child : end.getChildren()) {
+ getTableAliasFromExprNode(child, output);
+ }
+ }
+
+ }
}
Modified: hive/trunk/ql/src/test/results/clientpositive/union20.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/union20.q.out?rev=1618904&r1=1618903&r2=1618904&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/union20.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/union20.q.out Tue Aug 19 17:18:03 2014
@@ -130,14 +130,14 @@ STAGE PLANS:
0 {KEY.reducesinkkey0} {VALUE._col0}
1 {KEY.reducesinkkey0} {VALUE._col0}
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 36 Data size: 9792 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 1632 Basic stats: COMPLETE Column stats: PARTIAL
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 36 Data size: 9792 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 1632 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
- Statistics: Num rows: 36 Data size: 9792 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 6 Data size: 1632 Basic stats: COMPLETE Column stats: PARTIAL
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat