Posted to commits@hive.apache.org by pr...@apache.org on 2014/11/03 04:58:13 UTC

svn commit: r1636236 - in /hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql: exec/Utilities.java optimizer/SetReducerParallelism.java optimizer/stats/annotation/StatsRulesProcFactory.java stats/StatsUtils.java

Author: prasanthj
Date: Mon Nov  3 03:58:13 2014
New Revision: 1636236

URL: http://svn.apache.org/r1636236
Log:
HIVE-8689: handle overflows in statistics better (Sergey Shelukhin reviewed by Prasanth J)
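
For context (illustration only, not part of the patch): Java long arithmetic silently wraps on overflow, so unchecked additions and multiplications of large per-operator estimates can turn into negative sizes and row counts; the helpers introduced below saturate at Long.MAX_VALUE instead.

    public class WrapAroundSketch {
      public static void main(String[] args) {
        long dataSize = Long.MAX_VALUE - 10;   // hypothetical estimate already near the cap
        System.out.println(dataSize + 100);    // wraps to a negative "data size"
        System.out.println(dataSize * 2);      // also wraps negative
        // The patch routes such arithmetic through StatsUtils.safeAdd/safeMult,
        // which return Long.MAX_VALUE on overflow instead.
      }
    }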

Modified:
    hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
    hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java
    hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
    hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java

Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java Mon Nov  3 03:58:13 2014
@@ -3131,12 +3131,11 @@ public final class Utilities {
 
   public static int estimateReducers(long totalInputFileSize, long bytesPerReducer,
       int maxReducers, boolean powersOfTwo) {
-
-    int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer);
+    double bytes = Math.max(totalInputFileSize, bytesPerReducer);
+    int reducers = (int) Math.ceil(bytes / bytesPerReducer);
     reducers = Math.max(1, reducers);
     reducers = Math.min(maxReducers, reducers);
 
-
     int reducersLog = (int)(Math.log(reducers) / Math.log(2)) + 1;
     int reducersPowerTwo = (int)Math.pow(2, reducersLog);
 
@@ -3175,7 +3174,7 @@ public final class Utilities {
     }
 
     if (highestSamplePercentage >= 0) {
-      totalInputFileSize = Math.min((long) (totalInputFileSize * highestSamplePercentage / 100D)
+      totalInputFileSize = Math.min((long) (totalInputFileSize * (highestSamplePercentage / 100D))
           , totalInputFileSize);
     }
     return totalInputFileSize;
@@ -3199,7 +3198,7 @@ public final class Utilities {
     }
 
     if (highestSamplePercentage >= 0) {
-      totalInputNumFiles = Math.min((long) (totalInputNumFiles * highestSamplePercentage / 100D)
+      totalInputNumFiles = Math.min((long) (totalInputNumFiles * (highestSamplePercentage / 100D))
           , totalInputNumFiles);
     }
     return totalInputNumFiles;
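
For reference, a standalone sketch (illustrative values only, not part of the patch) of the estimateReducers change above: once an upstream estimate has saturated near Long.MAX_VALUE, the old long-arithmetic expression wraps negative and the reducer count collapses to 1, while the new double-based expression keeps a large positive estimate that the existing min/max caps can bound.

    public class EstimateReducersSketch {
      public static void main(String[] args) {
        long totalInputFileSize = Long.MAX_VALUE;   // e.g. a data size already clamped upstream
        long bytesPerReducer = 256L * 1024 * 1024;  // hypothetical 256MB per reducer

        // Old expression: the addition overflows, the estimate goes negative,
        // and the later Math.max(1, reducers) collapses it to a single reducer.
        long oldEstimate = (totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer;
        System.out.println(oldEstimate);            // negative

        // New expression: computed in double space, so it stays positive and is
        // then capped by Math.min(maxReducers, reducers).
        double bytes = Math.max(totalInputFileSize, bytesPerReducer);
        int reducers = (int) Math.ceil(bytes / bytesPerReducer);
        System.out.println(reducers);               // Integer.MAX_VALUE before the caps
      }
    }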

Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetReducerParallelism.java Mon Nov  3 03:58:13 2014
@@ -31,11 +31,13 @@ import org.apache.hadoop.hive.ql.exec.Ut
 import org.apache.hadoop.hive.ql.lib.Node;
 import org.apache.hadoop.hive.ql.lib.NodeProcessor;
 import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory;
 import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
 
 import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.AUTOPARALLEL;
 import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.UNIFORM;
@@ -82,7 +84,8 @@ public class SetReducerParallelism imple
         for (Operator<? extends OperatorDesc> sibling:
           sink.getChildOperators().get(0).getParentOperators()) {
           if (sibling.getStatistics() != null) {
-            numberOfBytes += sibling.getStatistics().getDataSize();
+            numberOfBytes = StatsUtils.safeAdd(
+                numberOfBytes, sibling.getStatistics().getDataSize());
           } else {
             LOG.warn("No stats available from: "+sibling);
           }
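
A small sketch (made-up sizes, not part of the patch) of what the safeAdd accumulation above buys: a plain += over several large sibling estimates can wrap negative, whereas the saturating sum stays at Long.MAX_VALUE.

    import org.apache.hadoop.hive.ql.stats.StatsUtils;

    public class AccumulateSizesSketch {
      public static void main(String[] args) {
        // Hypothetical per-sibling data sizes, two of them already huge.
        long[] siblingDataSizes = { Long.MAX_VALUE / 2, Long.MAX_VALUE / 2, 4096 };

        long plainSum = 0;
        long safeSum = 0;
        for (long size : siblingDataSizes) {
          plainSum += size;                            // wraps negative on the last element
          safeSum = StatsUtils.safeAdd(safeSum, size); // saturates at Long.MAX_VALUE
        }
        System.out.println(plainSum); // negative
        System.out.println(safeSum);  // Long.MAX_VALUE
      }
    }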

Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Mon Nov  3 03:58:13 2014
@@ -171,7 +171,7 @@ public class StatsRulesProcFactory {
           // in case of select(*) the data size does not change
           if (!sop.getConf().isSelectStar() && !sop.getConf().isSelStarNoCompute()) {
             long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats);
-            stats.setDataSize(StatsUtils.getMaxIfOverflow(dataSize));
+            stats.setDataSize(dataSize);
           }
           sop.setStatistics(stats);
 
@@ -323,8 +323,8 @@ public class StatsRulesProcFactory {
         } else if (udf instanceof GenericUDFOPOr) {
           // for OR condition independently compute and update stats
           for (ExprNodeDesc child : genFunc.getChildren()) {
-            newNumRows += evaluateChildExpr(stats, child, aspCtx, neededCols,
-                fop);
+            newNumRows = StatsUtils.safeAdd(
+                evaluateChildExpr(stats, child, aspCtx, neededCols, fop), newNumRows);
           }
         } else if (udf instanceof GenericUDFOPNot) {
           newNumRows = evaluateNotExpr(stats, pred, aspCtx, neededCols, fop);
@@ -678,9 +678,9 @@ public class StatsRulesProcFactory {
             if (cs != null) {
               long ndv = cs.getCountDistint();
               if (cs.getNumNulls() > 0) {
-                ndv += 1;
+                ndv = StatsUtils.safeAdd(ndv, 1);
               }
-              ndvProduct *= ndv;
+              ndvProduct = StatsUtils.safeMult(ndvProduct, ndv);
             } else {
               if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
                 // the column must be an aggregate column inserted by GBY. We
@@ -715,15 +715,16 @@ public class StatsRulesProcFactory {
             if (mapSideHashAgg) {
               if (containsGroupingSet) {
                 // Case 4: column stats, hash aggregation, grouping sets
-                cardinality = Math.min((parentNumRows * sizeOfGroupingSet) / 2,
-                    ndvProduct * parallelism * sizeOfGroupingSet);
+                cardinality = Math.min(
+                    (StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet)) / 2,
+                    StatsUtils.safeMult(StatsUtils.safeMult(ndvProduct, parallelism), sizeOfGroupingSet));
 
                 if (isDebugEnabled) {
                   LOG.debug("[Case 4] STATS-" + gop.toString() + ": cardinality: " + cardinality);
                 }
               } else {
                 // Case 3: column stats, hash aggregation, NO grouping sets
-                cardinality = Math.min(parentNumRows / 2, ndvProduct * parallelism);
+                cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
 
                 if (isDebugEnabled) {
                   LOG.debug("[Case 3] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -732,7 +733,7 @@ public class StatsRulesProcFactory {
             } else {
               if (containsGroupingSet) {
                 // Case 6: column stats, NO hash aggregation, grouping sets
-                cardinality = parentNumRows * sizeOfGroupingSet;
+                cardinality = StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet);
 
                 if (isDebugEnabled) {
                   LOG.debug("[Case 6] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -759,7 +760,7 @@ public class StatsRulesProcFactory {
 
             if (containsGroupingSet) {
               // Case 8: column stats, grouping sets
-              cardinality = Math.min(parentNumRows, ndvProduct * sizeOfGroupingSet);
+              cardinality = Math.min(parentNumRows, StatsUtils.safeMult(ndvProduct, sizeOfGroupingSet));
 
               if (isDebugEnabled) {
                 LOG.debug("[Case 8] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -790,7 +791,7 @@ public class StatsRulesProcFactory {
 
               if (containsGroupingSet) {
                 // Case 2: NO column stats, NO hash aggregation, grouping sets
-                cardinality = parentNumRows * sizeOfGroupingSet;
+                cardinality = StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet);
 
                 if (isDebugEnabled) {
                   LOG.debug("[Case 2] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -902,7 +903,7 @@ public class StatsRulesProcFactory {
         long avgKeySize = 0;
         for (ColStatistics cs : colStats) {
           if (cs != null) {
-            numEstimatedRows *= cs.getCountDistint();
+            numEstimatedRows = StatsUtils.safeMult(numEstimatedRows, cs.getCountDistint());
             avgKeySize += Math.ceil(cs.getAvgColLen());
           }
         }
@@ -956,7 +957,7 @@ public class StatsRulesProcFactory {
         long hashEntrySize = gop.javaHashEntryOverHead + avgKeySize + avgValSize;
 
         // estimated hash table size
-        long estHashTableSize = StatsUtils.getMaxIfOverflow(numEstimatedRows * hashEntrySize);
+        long estHashTableSize = StatsUtils.safeMult(numEstimatedRows, hashEntrySize);
 
         if (estHashTableSize < maxMemHashAgg) {
           return true;
@@ -1135,7 +1136,7 @@ public class StatsRulesProcFactory {
               denom = getEasedOutDenominator(distinctVals);
             } else {
               for (Long l : distinctVals) {
-                denom *= l;
+                denom = StatsUtils.safeMult(denom, l);
               }
             }
           } else {
@@ -1211,13 +1212,13 @@ public class StatsRulesProcFactory {
           }
 
           long maxDataSize = parentSizes.get(maxRowIdx);
-          long newNumRows = (long) (joinFactor * maxRowCount * (numParents - 1));
-          long newDataSize = (long) (joinFactor * maxDataSize * (numParents - 1));
+          long newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
+          long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
           Statistics wcStats = new Statistics();
-          wcStats.setNumRows(StatsUtils.getMaxIfOverflow(newNumRows));
-          wcStats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
+          wcStats.setNumRows(newNumRows);
+          wcStats.setDataSize(newDataSize);
           jop.setStatistics(wcStats);
-
+ 
           if (isDebugEnabled) {
             LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
           }
@@ -1336,6 +1337,7 @@ public class StatsRulesProcFactory {
         }
       }
 
+      // No need for overflow checks, assume selectivity is always <= 1.0
       float selMultiParent = 1.0f;
       for(Operator<? extends OperatorDesc> parent : multiParentOp.getParentOperators()) {
         // In the above example, TS-1 -> RS-1 and TS-2 -> RS-2 are simple trees
@@ -1491,7 +1493,7 @@ public class StatsRulesProcFactory {
 
       for (int i = 0; i < rowCountParents.size(); i++) {
         if (i != maxIdx) {
-          result *= rowCountParents.get(i);
+          result = StatsUtils.safeMult(result, rowCountParents.get(i));
         }
       }
 
@@ -1564,7 +1566,7 @@ public class StatsRulesProcFactory {
         long denom = 1;
         for (int i = 0; i < distinctVals.size(); i++) {
           if (i != minIdx) {
-            denom *= distinctVals.get(i);
+            denom = StatsUtils.safeMult(denom, distinctVals.get(i));
           }
         }
         return denom;
@@ -1608,12 +1610,13 @@ public class StatsRulesProcFactory {
             // in the absence of column statistics, compute data size based on
             // based on average row size
             Statistics wcStats = parentStats.clone();
+            limit = StatsUtils.getMaxIfOverflow(limit);
             if (limit <= parentStats.getNumRows()) {
               long numRows = limit;
               long avgRowSize = parentStats.getAvgRowSize();
-              long dataSize = avgRowSize * limit;
-              wcStats.setNumRows(StatsUtils.getMaxIfOverflow(numRows));
-              wcStats.setDataSize(StatsUtils.getMaxIfOverflow(dataSize));
+              long dataSize = StatsUtils.safeMult(avgRowSize, limit);
+              wcStats.setNumRows(numRows);
+              wcStats.setDataSize(dataSize);
             }
             lop.setStatistics(wcStats);
 

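Several of the cardinality estimates above feed a product into Math.min; a sketch (illustrative numbers only, not part of the patch) of why the saturating multiply matters there: once the unchecked product wraps negative, Math.min returns the bogus negative cardinality, while safeMult keeps the row-count bound in play.

    import org.apache.hadoop.hive.ql.stats.StatsUtils;

    public class CardinalitySketch {
      public static void main(String[] args) {
        long parentNumRows = 1000000000L;     // hypothetical
        long ndvProduct = Long.MAX_VALUE / 4; // hypothetical, already huge
        int parallelism = 8;

        // Unchecked: the product wraps negative and Math.min happily returns it.
        long unchecked = Math.min(parentNumRows / 2, ndvProduct * parallelism);
        System.out.println(unchecked); // negative

        // With safeMult the product saturates, so the row-count bound wins.
        long bounded = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
        System.out.println(bounded);   // 500000000
      }
    }
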
Modified: hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: http://svn.apache.org/viewvc/hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1636236&r1=1636235&r2=1636236&view=diff
==============================================================================
--- hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java (original)
+++ hive/branches/branch-0.14/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Mon Nov  3 03:58:13 2014
@@ -20,6 +20,8 @@ package org.apache.hadoop.hive.ql.stats;
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.Lists;
+import com.google.common.math.DoubleMath;
+import com.google.common.math.LongMath;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -390,7 +392,7 @@ public class StatsUtils {
       }
 
       if (s <= 0 && rc > 0) {
-        s = rc * avgRowSize;
+        s = safeMult(rc, avgRowSize);
         dataSizes.set(i, s);
       }
     }
@@ -495,7 +497,7 @@ public class StatsUtils {
     long result = 0;
     for (Long l : vals) {
       if (l > 0) {
-        result += l;
+        result = safeAdd(result, l);
       }
     }
     return result;
@@ -1259,6 +1261,7 @@ public class StatsUtils {
       if (cs != null) {
         String colType = cs.getColumnType();
         long nonNullCount = numRows - cs.getNumNulls();
+        double sizeOf = 0;
         if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
             || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
             || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
@@ -1266,31 +1269,25 @@ public class StatsUtils {
             || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
             || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
             || colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
-
-          result += nonNullCount * cs.getAvgColLen();
+          sizeOf = cs.getAvgColLen();
         } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
             || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
             || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
-
           int acl = (int) Math.round(cs.getAvgColLen());
-          result += nonNullCount * JavaDataModel.get().lengthForStringOfLength(acl);
+          sizeOf = JavaDataModel.get().lengthForStringOfLength(acl);
         } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
-
           int acl = (int) Math.round(cs.getAvgColLen());
-          result += nonNullCount * JavaDataModel.get().lengthForByteArrayOfSize(acl);
+          sizeOf = JavaDataModel.get().lengthForByteArrayOfSize(acl);
         } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
-
-          result += nonNullCount * JavaDataModel.get().lengthOfTimestamp();
+          sizeOf = JavaDataModel.get().lengthOfTimestamp();
         } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
-
-          result += nonNullCount * JavaDataModel.get().lengthOfDecimal();
+          sizeOf = JavaDataModel.get().lengthOfDecimal();
         } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
-
-          result += nonNullCount * JavaDataModel.get().lengthOfDate();
+          sizeOf = JavaDataModel.get().lengthOfDate();
         } else {
-
-          result += nonNullCount * cs.getAvgColLen();
+          sizeOf = cs.getAvgColLen();
         }
+        result = safeAdd(result, safeMult(nonNullCount, sizeOf));
       }
     }
 
@@ -1436,4 +1433,28 @@ public class StatsUtils {
   public static long getMaxIfOverflow(long val) {
     return val < 0 ? Long.MAX_VALUE : val;
   }
+
+  /** Bounded multiplication - overflows become MAX_VALUE */
+  public static long safeMult(long a, double b) {
+    double result = a * b;
+    return (result > Long.MAX_VALUE) ? Long.MAX_VALUE : (long)result;
+  }
+ 
+  /** Bounded addition - overflows become MAX_VALUE */
+  public static long safeAdd(long a, long b) {
+    try {
+      return LongMath.checkedAdd(a, b);
+    } catch (ArithmeticException ex) {
+      return Long.MAX_VALUE;
+    }
+  }
+ 
+  /** Bounded multiplication - overflows become MAX_VALUE */
+  public static long safeMult(long a, long b) {
+    try {
+      return LongMath.checkedMultiply(a, b);
+    } catch (ArithmeticException ex) {
+      return Long.MAX_VALUE;
+    }
+  }
 }
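
A short usage sketch of the helpers added above (illustrative values only): the long/long variants delegate to Guava's checked arithmetic and clamp on ArithmeticException, the long/double variant clamps anything past Long.MAX_VALUE, and getMaxIfOverflow still maps an already-wrapped negative back to the cap.

    import org.apache.hadoop.hive.ql.stats.StatsUtils;

    public class SafeMathUsageSketch {
      public static void main(String[] args) {
        System.out.println(StatsUtils.safeAdd(Long.MAX_VALUE, 1L));    // Long.MAX_VALUE
        System.out.println(StatsUtils.safeMult(Long.MAX_VALUE, 2L));   // Long.MAX_VALUE
        System.out.println(StatsUtils.safeMult(1000000L, 3.5));        // 3500000
        System.out.println(StatsUtils.safeMult(Long.MAX_VALUE, 2.0));  // Long.MAX_VALUE
        System.out.println(StatsUtils.getMaxIfOverflow(-42L));         // Long.MAX_VALUE
      }
    }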