You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2014/11/03 04:58:13 UTC
svn commit: r1636236 - in
/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql:
exec/Utilities.java optimizer/SetReducerParallelism.java
optimizer/stats/annotation/StatsRulesProcFactory.java stats/StatsUtils.java
Author: prasanthj
Date: Mon Nov 3 03:58:13 2014
New Revision: 1636236
URL: http://svn.apache.org/r1636236
Log:
HIVE-8689: handle overflows in statistics better (Sergey Shelukhin reviewed by Prasanth J)
Modified:
hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java
hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java Mon Nov 3 03:58:13 2014
@@ -3131,12 +3131,11 @@ public final class Utilities {
public static int estimateReducers(long totalInputFileSize, long bytesPerReducer,
int maxReducers, boolean powersOfTwo) {
-
- int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer);
+ double bytes = Math.max(totalInputFileSize, bytesPerReducer);
+ int reducers = (int) Math.ceil(bytes / bytesPerReducer);
reducers = Math.max(1, reducers);
reducers = Math.min(maxReducers, reducers);
-
int reducersLog = (int)(Math.log(reducers) / Math.log(2)) + 1;
int reducersPowerTwo = (int)Math.pow(2, reducersLog);
@@ -3175,7 +3174,7 @@ public final class Utilities {
}
if (highestSamplePercentage >= 0) {
- totalInputFileSize = Math.min((long) (totalInputFileSize * highestSamplePercentage / 100D)
+ totalInputFileSize = Math.min((long) (totalInputFileSize * (highestSamplePercentage / 100D))
, totalInputFileSize);
}
return totalInputFileSize;
@@ -3199,7 +3198,7 @@ public final class Utilities {
}
if (highestSamplePercentage >= 0) {
- totalInputNumFiles = Math.min((long) (totalInputNumFiles * highestSamplePercentage / 100D)
+ totalInputNumFiles = Math.min((long) (totalInputNumFiles * (highestSamplePercentage / 100D))
, totalInputNumFiles);
}
return totalInputNumFiles;
Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java Mon Nov 3 03:58:13 2014
@@ -31,11 +31,13 @@ import org.apache.hadoop.hive.ql.exec.Ut
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory;
import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.AUTOPARALLEL;
import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.UNIFORM;
@@ -82,7 +84,8 @@ public class SetReducerParallelism imple
for (Operator<? extends OperatorDesc> sibling:
sink.getChildOperators().get(0).getParentOperators()) {
if (sibling.getStatistics() != null) {
- numberOfBytes += sibling.getStatistics().getDataSize();
+ numberOfBytes = StatsUtils.safeAdd(
+ numberOfBytes, sibling.getStatistics().getDataSize());
} else {
LOG.warn("No stats available from: "+sibling);
}
Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Mon Nov 3 03:58:13 2014
@@ -171,7 +171,7 @@ public class StatsRulesProcFactory {
// in case of select(*) the data size does not change
if (!sop.getConf().isSelectStar() && !sop.getConf().isSelStarNoCompute()) {
long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats);
- stats.setDataSize(StatsUtils.getMaxIfOverflow(dataSize));
+ stats.setDataSize(dataSize);
}
sop.setStatistics(stats);
@@ -323,8 +323,8 @@ public class StatsRulesProcFactory {
} else if (udf instanceof GenericUDFOPOr) {
// for OR condition independently compute and update stats
for (ExprNodeDesc child : genFunc.getChildren()) {
- newNumRows += evaluateChildExpr(stats, child, aspCtx, neededCols,
- fop);
+ newNumRows = StatsUtils.safeAdd(
+ evaluateChildExpr(stats, child, aspCtx, neededCols, fop), newNumRows);
}
} else if (udf instanceof GenericUDFOPNot) {
newNumRows = evaluateNotExpr(stats, pred, aspCtx, neededCols, fop);
@@ -678,9 +678,9 @@ public class StatsRulesProcFactory {
if (cs != null) {
long ndv = cs.getCountDistint();
if (cs.getNumNulls() > 0) {
- ndv += 1;
+ ndv = StatsUtils.safeAdd(ndv, 1);
}
- ndvProduct *= ndv;
+ ndvProduct = StatsUtils.safeMult(ndvProduct, ndv);
} else {
if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
// the column must be an aggregate column inserted by GBY. We
@@ -715,15 +715,16 @@ public class StatsRulesProcFactory {
if (mapSideHashAgg) {
if (containsGroupingSet) {
// Case 4: column stats, hash aggregation, grouping sets
- cardinality = Math.min((parentNumRows * sizeOfGroupingSet) / 2,
- ndvProduct * parallelism * sizeOfGroupingSet);
+ cardinality = Math.min(
+ (StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet)) / 2,
+ StatsUtils.safeMult(StatsUtils.safeMult(ndvProduct, parallelism), sizeOfGroupingSet));
if (isDebugEnabled) {
LOG.debug("[Case 4] STATS-" + gop.toString() + ": cardinality: " + cardinality);
}
} else {
// Case 3: column stats, hash aggregation, NO grouping sets
- cardinality = Math.min(parentNumRows / 2, ndvProduct * parallelism);
+ cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
if (isDebugEnabled) {
LOG.debug("[Case 3] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -732,7 +733,7 @@ public class StatsRulesProcFactory {
} else {
if (containsGroupingSet) {
// Case 6: column stats, NO hash aggregation, grouping sets
- cardinality = parentNumRows * sizeOfGroupingSet;
+ cardinality = StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet);
if (isDebugEnabled) {
LOG.debug("[Case 6] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -759,7 +760,7 @@ public class StatsRulesProcFactory {
if (containsGroupingSet) {
// Case 8: column stats, grouping sets
- cardinality = Math.min(parentNumRows, ndvProduct * sizeOfGroupingSet);
+ cardinality = Math.min(parentNumRows, StatsUtils.safeMult(ndvProduct, sizeOfGroupingSet));
if (isDebugEnabled) {
LOG.debug("[Case 8] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -790,7 +791,7 @@ public class StatsRulesProcFactory {
if (containsGroupingSet) {
// Case 2: NO column stats, NO hash aggregation, grouping sets
- cardinality = parentNumRows * sizeOfGroupingSet;
+ cardinality = StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet);
if (isDebugEnabled) {
LOG.debug("[Case 2] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -902,7 +903,7 @@ public class StatsRulesProcFactory {
long avgKeySize = 0;
for (ColStatistics cs : colStats) {
if (cs != null) {
- numEstimatedRows *= cs.getCountDistint();
+ numEstimatedRows = StatsUtils.safeMult(numEstimatedRows, cs.getCountDistint());
avgKeySize += Math.ceil(cs.getAvgColLen());
}
}
@@ -956,7 +957,7 @@ public class StatsRulesProcFactory {
long hashEntrySize = gop.javaHashEntryOverHead + avgKeySize + avgValSize;
// estimated hash table size
- long estHashTableSize = StatsUtils.getMaxIfOverflow(numEstimatedRows * hashEntrySize);
+ long estHashTableSize = StatsUtils.safeMult(numEstimatedRows, hashEntrySize);
if (estHashTableSize < maxMemHashAgg) {
return true;
@@ -1135,7 +1136,7 @@ public class StatsRulesProcFactory {
denom = getEasedOutDenominator(distinctVals);
} else {
for (Long l : distinctVals) {
- denom *= l;
+ denom = StatsUtils.safeMult(denom, l);
}
}
} else {
@@ -1211,13 +1212,13 @@ public class StatsRulesProcFactory {
}
long maxDataSize = parentSizes.get(maxRowIdx);
- long newNumRows = (long) (joinFactor * maxRowCount * (numParents - 1));
- long newDataSize = (long) (joinFactor * maxDataSize * (numParents - 1));
+ long newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
+ long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
Statistics wcStats = new Statistics();
- wcStats.setNumRows(StatsUtils.getMaxIfOverflow(newNumRows));
- wcStats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
+ wcStats.setNumRows(newNumRows);
+ wcStats.setDataSize(newDataSize);
jop.setStatistics(wcStats);
-
+
if (isDebugEnabled) {
LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
}
@@ -1336,6 +1337,7 @@ public class StatsRulesProcFactory {
}
}
+ // No need for overflow checks, assume selectivity is always <= 1.0
float selMultiParent = 1.0f;
for(Operator<? extends OperatorDesc> parent : multiParentOp.getParentOperators()) {
// In the above example, TS-1 -> RS-1 and TS-2 -> RS-2 are simple trees
@@ -1491,7 +1493,7 @@ public class StatsRulesProcFactory {
for (int i = 0; i < rowCountParents.size(); i++) {
if (i != maxIdx) {
- result *= rowCountParents.get(i);
+ result = StatsUtils.safeMult(result, rowCountParents.get(i));
}
}
@@ -1564,7 +1566,7 @@ public class StatsRulesProcFactory {
long denom = 1;
for (int i = 0; i < distinctVals.size(); i++) {
if (i != minIdx) {
- denom *= distinctVals.get(i);
+ denom = StatsUtils.safeMult(denom, distinctVals.get(i));
}
}
return denom;
@@ -1608,12 +1610,13 @@ public class StatsRulesProcFactory {
// in the absence of column statistics, compute data size based on
// based on average row size
Statistics wcStats = parentStats.clone();
+ limit = StatsUtils.getMaxIfOverflow(limit);
if (limit <= parentStats.getNumRows()) {
long numRows = limit;
long avgRowSize = parentStats.getAvgRowSize();
- long dataSize = avgRowSize * limit;
- wcStats.setNumRows(StatsUtils.getMaxIfOverflow(numRows));
- wcStats.setDataSize(StatsUtils.getMaxIfOverflow(dataSize));
+ long dataSize = StatsUtils.safeMult(avgRowSize, limit);
+ wcStats.setNumRows(numRows);
+ wcStats.setDataSize(dataSize);
}
lop.setStatistics(wcStats);
Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Mon Nov 3 03:58:13 2014
@@ -20,6 +20,8 @@ package org.apache.hadoop.hive.ql.stats;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
+import com.google.common.math.DoubleMath;
+import com.google.common.math.LongMath;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -390,7 +392,7 @@ public class StatsUtils {
}
if (s <= 0 && rc > 0) {
- s = rc * avgRowSize;
+ s = safeMult(rc, avgRowSize);
dataSizes.set(i, s);
}
}
@@ -495,7 +497,7 @@ public class StatsUtils {
long result = 0;
for (Long l : vals) {
if (l > 0) {
- result += l;
+ result = safeAdd(result, l);
}
}
return result;
@@ -1259,6 +1261,7 @@ public class StatsUtils {
if (cs != null) {
String colType = cs.getColumnType();
long nonNullCount = numRows - cs.getNumNulls();
+ double sizeOf = 0;
if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
|| colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
|| colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
@@ -1266,31 +1269,25 @@ public class StatsUtils {
|| colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
|| colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
|| colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
-
- result += nonNullCount * cs.getAvgColLen();
+ sizeOf = cs.getAvgColLen();
} else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
|| colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
|| colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
-
int acl = (int) Math.round(cs.getAvgColLen());
- result += nonNullCount * JavaDataModel.get().lengthForStringOfLength(acl);
+ sizeOf = JavaDataModel.get().lengthForStringOfLength(acl);
} else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
-
int acl = (int) Math.round(cs.getAvgColLen());
- result += nonNullCount * JavaDataModel.get().lengthForByteArrayOfSize(acl);
+ sizeOf = JavaDataModel.get().lengthForByteArrayOfSize(acl);
} else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
-
- result += nonNullCount * JavaDataModel.get().lengthOfTimestamp();
+ sizeOf = JavaDataModel.get().lengthOfTimestamp();
} else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
-
- result += nonNullCount * JavaDataModel.get().lengthOfDecimal();
+ sizeOf = JavaDataModel.get().lengthOfDecimal();
} else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
-
- result += nonNullCount * JavaDataModel.get().lengthOfDate();
+ sizeOf = JavaDataModel.get().lengthOfDate();
} else {
-
- result += nonNullCount * cs.getAvgColLen();
+ sizeOf = cs.getAvgColLen();
}
+ result = safeAdd(result, safeMult(nonNullCount, sizeOf));
}
}
@@ -1436,4 +1433,28 @@ public class StatsUtils {
public static long getMaxIfOverflow(long val) {
return val < 0 ? Long.MAX_VALUE : val;
}
+
+ /** Bounded multiplication - overflows become MAX_VALUE */
+ public static long safeMult(long a, double b) {
+ double result = a * b;
+ return (result > Long.MAX_VALUE) ? Long.MAX_VALUE : (long)result;
+ }
+
+ /** Bounded addition - overflows become MAX_VALUE */
+ public static long safeAdd(long a, long b) {
+ try {
+ return LongMath.checkedAdd(a, b);
+ } catch (ArithmeticException ex) {
+ return Long.MAX_VALUE;
+ }
+ }
+
+ /** Bounded multiplication - overflows become MAX_VALUE */
+ public static long safeMult(long a, long b) {
+ try {
+ return LongMath.checkedMultiply(a, b);
+ } catch (ArithmeticException ex) {
+ return Long.MAX_VALUE;
+ }
+ }
}