You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2019/12/14 02:31:20 UTC

[hive] branch master updated: HIVE-22632 : Improve estimateRowSizeFromSchema (Vineet Garg via Zoltan Haindrich)

This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new e5f3538  HIVE-22632 : Improve estimateRowSizeFromSchema (Vineet Garg via Zoltan Haindrich)
e5f3538 is described below

commit e5f3538442838dfe6988be595ec34bc48cefe0ec
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Fri Dec 13 18:30:35 2019 -0800

    HIVE-22632 : Improve estimateRowSizeFromSchema (Vineet Garg via Zoltan Haindrich)
    
    Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
 .../stats/annotation/StatsRulesProcFactory.java    |  2 +-
 .../apache/hadoop/hive/ql/stats/StatsUtils.java    | 44 ++++++++++++----------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 5a7c1af..f5c9cbd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -2575,7 +2575,7 @@ public class StatsRulesProcFactory {
         }
       }
       if (neededColumns.size() != 0) {
-        int restColumnsDefaultSize =
+        long restColumnsDefaultSize =
             StatsUtils.estimateRowSizeFromSchema(conf, jop.getSchema().getSignature(), neededColumns);
         newDataSize = StatsUtils.safeAdd(newDataSize, StatsUtils.safeMult(restColumnsDefaultSize, newNumRows));
       }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 3f71ee4..8084dcd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -649,37 +649,41 @@ public class StatsUtils {
     return range;
   }
 
-  public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema) {
-    List<String> neededColumns = new ArrayList<>();
+  private static long getAvgColSize(final ColumnInfo columnInfo, HiveConf conf) {
+    ObjectInspector oi = columnInfo.getObjectInspector();
+    String colTypeLowerCase = columnInfo.getTypeName().toLowerCase();
+    if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
+        || colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)
+        || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
+        || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)
+        || colTypeLowerCase.startsWith(serdeConstants.LIST_TYPE_NAME)
+        || colTypeLowerCase.startsWith(serdeConstants.MAP_TYPE_NAME)
+        || colTypeLowerCase.startsWith(serdeConstants.STRUCT_TYPE_NAME)
+        || colTypeLowerCase.startsWith(serdeConstants.UNION_TYPE_NAME)) {
+      return getAvgColLenOf(conf, oi, colTypeLowerCase);
+    } else {
+      return getAvgColLenOfFixedLengthTypes(colTypeLowerCase);
+    }
+  }
+
+  public static long estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema) {
+    long avgRowSize = 0;
     for (ColumnInfo ci : schema) {
-      neededColumns.add(ci.getInternalName());
+      avgRowSize += getAvgColSize(ci, conf);
     }
-    return estimateRowSizeFromSchema(conf, schema, neededColumns);
+    return avgRowSize;
   }
 
-  public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema,
+  public static long estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema,
       List<String> neededColumns) {
-    int avgRowSize = 0;
+    long avgRowSize = 0;
     for (String neededCol : neededColumns) {
       ColumnInfo ci = getColumnInfoForColumn(neededCol, schema);
       if (ci == null) {
         // No need to collect statistics of index columns
         continue;
       }
-      ObjectInspector oi = ci.getObjectInspector();
-      String colTypeLowerCase = ci.getTypeName().toLowerCase();
-      if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
-          || colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)
-          || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
-          || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)
-          || colTypeLowerCase.startsWith(serdeConstants.LIST_TYPE_NAME)
-          || colTypeLowerCase.startsWith(serdeConstants.MAP_TYPE_NAME)
-          || colTypeLowerCase.startsWith(serdeConstants.STRUCT_TYPE_NAME)
-          || colTypeLowerCase.startsWith(serdeConstants.UNION_TYPE_NAME)) {
-        avgRowSize += getAvgColLenOf(conf, oi, colTypeLowerCase);
-      } else {
-        avgRowSize += getAvgColLenOfFixedLengthTypes(colTypeLowerCase);
-      }
+      avgRowSize += getAvgColSize(ci, conf);
     }
     return avgRowSize;
   }