Posted to commits@hive.apache.org by ha...@apache.org on 2014/04/28 17:40:32 UTC

svn commit: r1590681 [1/3] - in /hive/trunk: common/src/java/org/apache/hadoop/hive/common/ metastore/src/java/org/apache/hadoop/hive/metastore/ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientn...

Author: hashutosh
Date: Mon Apr 28 15:40:31 2014
New Revision: 1590681

URL: http://svn.apache.org/r1590681
Log:
HIVE-6979 : Hadoop-2 test failures related to quick stats not being populated correctly (Prasanth J via Ashutosh Chauhan)

Added:
    hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q
    hive/trunk/ql/src/test/results/clientpositive/union_remove_25.q.out
Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
    hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
    hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
    hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out
    hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out
    hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out
    hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out
    hive/trunk/ql/src/test/results/clientpositive/filter_numeric.q.out
    hive/trunk/ql/src/test/results/clientpositive/groupby_sort_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/infer_bucket_sort_list_bucket.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_6.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_7.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_8.q.out
    hive/trunk/ql/src/test/results/clientpositive/mapjoin_test_outer.q.out
    hive/trunk/ql/src/test/results/clientpositive/nullformatCTAS.q.out
    hive/trunk/ql/src/test/results/clientpositive/nullgroup3.q.out
    hive/trunk/ql/src/test/results/clientpositive/orc_createas1.q.out
    hive/trunk/ql/src/test/results/clientpositive/ppd_join4.q.out
    hive/trunk/ql/src/test/results/clientpositive/select_dummy_source.q.out
    hive/trunk/ql/src/test/results/clientpositive/show_create_table_alter.q.out
    hive/trunk/ql/src/test/results/clientpositive/show_tblproperties.q.out
    hive/trunk/ql/src/test/results/clientpositive/stats_list_bucket.q.out
    hive/trunk/ql/src/test/results/clientpositive/stats_partscan_1_23.q.out
    hive/trunk/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
    hive/trunk/ql/src/test/results/clientpositive/truncate_column_list_bucket.q.out
    hive/trunk/ql/src/test/results/clientpositive/udf_current_database.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_10.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_12.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_13.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_14.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_19.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_2.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_20.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_21.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_22.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_23.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_24.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_4.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_5.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_7.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_8.q.out
    hive/trunk/ql/src/test/results/clientpositive/union_remove_9.q.out
    hive/trunk/ql/src/test/results/clientpositive/unset_table_view_property.q.out

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java Mon Apr 28 15:40:31 2014
@@ -18,6 +18,8 @@
 package org.apache.hadoop.hive.common;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -46,6 +48,22 @@ public class HiveStatsUtils {
   public static FileStatus[] getFileStatusRecurse(Path path, int level, FileSystem fs)
       throws IOException {
 
+    // if level is < 0, then return all files/directories under the specified path
+    if ( level < 0) {
+      List<FileStatus> result = new ArrayList<FileStatus>();
+      try {
+        FileStatus fileStatus = fs.getFileStatus(path);
+        FileUtils.listStatusRecursively(fs, fileStatus, result);
+      } catch (IOException e) {
+        // The globStatus() API returns an empty FileStatus[] when the specified path
+        // does not exist, but getFileStatus() throws an IOException. To mimic that
+        // behavior, we return an empty array on exception. For external tables,
+        // the path of the table may not exist during table creation.
+        return new FileStatus[0];
+      }
+      return result.toArray(new FileStatus[result.size()]);
+    }
+
     // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
     StringBuilder sb = new StringBuilder(path.toUri().getPath());
     for (int i = 0; i < level; i++) {
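
The HiveStatsUtils change above makes a negative level mean "recurse to any depth" instead of expanding a fixed /*/*-style glob. Below is a minimal, self-contained sketch of that behaviour; the committed code delegates to FileUtils.listStatusRecursively, so the helper here is illustrative only and may differ in details such as hidden-file filtering.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative stand-in for the level < 0 branch of getFileStatusRecurse.
public class RecursiveListingSketch {

  public static FileStatus[] listAll(FileSystem fs, Path root) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    walk(fs, fs.getFileStatus(root), result);
    return result.toArray(new FileStatus[result.size()]);
  }

  // Depth-first walk: collects the entry and everything below it, however deeply
  // nested (list bucketing, union-removal subdirectories, dynamic partitions, ...).
  // Directories end up in the result too; callers such as populateQuickStats skip
  // them when computing numFiles/totalSize.
  private static void walk(FileSystem fs, FileStatus entry, List<FileStatus> result)
      throws IOException {
    result.add(entry);
    if (entry.isDir()) {
      for (FileStatus child : fs.listStatus(entry.getPath())) {
        walk(fs, child, result);
      }
    }
  }
}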

Modified: hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java (original)
+++ hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java Mon Apr 28 15:40:31 2014
@@ -189,13 +189,8 @@ public class MetaStoreUtils {
         // Let's try to populate those stats that don't require full scan.
         LOG.info("Updating table stats fast for " + tbl.getTableName());
         FileStatus[] fileStatus = wh.getFileStatusesForUnpartitionedTable(db, tbl);
-        params.put(StatsSetupConst.NUM_FILES, Integer.toString(fileStatus.length));
-        long tableSize = 0L;
-        for (FileStatus status : fileStatus) {
-          tableSize += status.getLen();
-        }
-        params.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tableSize));
-        LOG.info("Updated size of table " + tbl.getTableName() +" to "+ Long.toString(tableSize));
+        populateQuickStats(fileStatus, params);
+        LOG.info("Updated size of table " + tbl.getTableName() +" to "+ params.get(StatsSetupConst.TOTAL_SIZE));
         if(!params.containsKey(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK)) {
           // invalidate stats requiring scan since this is a regular ddl alter case
           for (String stat : StatsSetupConst.statsRequireCompute) {
@@ -213,6 +208,20 @@ public class MetaStoreUtils {
     return updated;
   }
 
+  public static void populateQuickStats(FileStatus[] fileStatus, Map<String, String> params) {
+    int numFiles = 0;
+    long tableSize = 0L;
+    for (FileStatus status : fileStatus) {
+      // don't take directories into account for quick stats
+      if (!status.isDir()) {
+        tableSize += status.getLen();
+        numFiles += 1;
+      }
+    }
+    params.put(StatsSetupConst.NUM_FILES, Integer.toString(numFiles));
+    params.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tableSize));
+  }
+
   // check if stats need to be (re)calculated
   public static boolean requireCalStats(Configuration hiveConf, Partition oldPart,
     Partition newPart, Table tbl) {
@@ -285,13 +294,8 @@ public class MetaStoreUtils {
         // populate those statistics that don't require a full scan of the data.
         LOG.warn("Updating partition stats fast for: " + part.getTableName());
         FileStatus[] fileStatus = wh.getFileStatusesForSD(part.getSd());
-        params.put(StatsSetupConst.NUM_FILES, Integer.toString(fileStatus.length));
-        long partSize = 0L;
-        for (int i = 0; i < fileStatus.length; i++) {
-          partSize += fileStatus[i].getLen();
-        }
-        params.put(StatsSetupConst.TOTAL_SIZE, Long.toString(partSize));
-        LOG.warn("Updated size to " + Long.toString(partSize));
+        populateQuickStats(fileStatus, params);
+        LOG.warn("Updated size to " + params.get(StatsSetupConst.TOTAL_SIZE));
         if(!params.containsKey(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK)) {
           // invalidate stats requiring scan since this is a regular ddl alter case
           for (String stat : StatsSetupConst.statsRequireCompute) {
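
For reference, a minimal usage sketch of the new helper follows, assuming the Hive common and metastore jars are on the classpath and an arbitrary table directory is passed as the first argument; the class and main method are illustrative only, while getFileStatusRecurse, populateQuickStats, and the StatsSetupConst keys are the ones shown in this diff.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;

public class QuickStatsSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical table location; any directory reachable through a Hadoop FileSystem works.
    Path tableDir = new Path(args[0]);
    FileSystem fs = tableDir.getFileSystem(new Configuration());

    // level = -1: walk the whole tree, however deep list bucketing or
    // union removal nested the output directories.
    FileStatus[] statuses = HiveStatsUtils.getFileStatusRecurse(tableDir, -1, fs);

    Map<String, String> params = new HashMap<String, String>();
    MetaStoreUtils.populateQuickStats(statuses, params);

    // Only plain files are counted; directories are skipped.
    System.out.println("numFiles  = " + params.get(StatsSetupConst.NUM_FILES));
    System.out.println("totalSize = " + params.get(StatsSetupConst.TOTAL_SIZE));
  }
}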

Modified: hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java
URL: http://svn.apache.org/viewvc/hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java (original)
+++ hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java Mon Apr 28 15:40:31 2014
@@ -496,9 +496,7 @@ public class Warehouse {
     try {
       Path path = new Path(desc.getLocation());
       FileSystem fileSys = path.getFileSystem(conf);
-      /* consider sub-directory created from list bucketing. */
-      int listBucketingDepth = calculateListBucketingDMLDepth(desc);
-      return HiveStatsUtils.getFileStatusRecurse(path, (1 + listBucketingDepth), fileSys);
+      return HiveStatsUtils.getFileStatusRecurse(path, -1, fileSys);
     } catch (IOException ioe) {
       MetaStoreUtils.logAndThrowMetaException(ioe);
     }
@@ -506,28 +504,6 @@ public class Warehouse {
   }
 
   /**
-   * List bucketing will introduce sub-directories.
-   * calculate it here in order to go to the leaf directory
-   * so that we can count right number of files.
-   * @param desc
-   * @return
-   */
-  private static int calculateListBucketingDMLDepth(StorageDescriptor desc) {
-    // list bucketing will introduce more files
-    int listBucketingDepth = 0;
-    SkewedInfo skewedInfo = desc.getSkewedInfo();
-    if ((skewedInfo != null) && (skewedInfo.getSkewedColNames() != null)
-        && (skewedInfo.getSkewedColNames().size() > 0)
-        && (skewedInfo.getSkewedColValues() != null)
-        && (skewedInfo.getSkewedColValues().size() > 0)
-        && (skewedInfo.getSkewedColValueLocationMaps() != null)
-        && (skewedInfo.getSkewedColValueLocationMaps().size() > 0)) {
-      listBucketingDepth = skewedInfo.getSkewedColNames().size();
-    }
-    return listBucketingDepth;
-  }
-
-  /**
    * @param table
    * @return array of FileStatus objects corresponding to the files making up the passed
    * unpartitioned table
@@ -537,7 +513,7 @@ public class Warehouse {
     Path tablePath = getTablePath(db, table.getTableName());
     try {
       FileSystem fileSys = tablePath.getFileSystem(conf);
-      return HiveStatsUtils.getFileStatusRecurse(tablePath, 1, fileSys);
+      return HiveStatsUtils.getFileStatusRecurse(tablePath, -1, fileSys);
     } catch (IOException ioe) {
       MetaStoreUtils.logAndThrowMetaException(ioe);
     }

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java Mon Apr 28 15:40:31 2014
@@ -30,6 +30,7 @@ import org.apache.commons.logging.LogFac
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
 import org.apache.hadoop.hive.metastore.Warehouse;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
@@ -326,12 +327,7 @@ public class StatsTask extends Task<Stat
      * calculate fast statistics
      */
     FileStatus[] partfileStatus = wh.getFileStatusesForSD(desc);
-    parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(partfileStatus.length));
-    long partSize = 0L;
-    for (int i = 0; i < partfileStatus.length; i++) {
-      partSize += partfileStatus[i].getLen();
-    }
-    parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(partSize));
+    MetaStoreUtils.populateQuickStats(partfileStatus, parameters);
   }
 
   private void clearStats(Map<String, String> parameters) {

Added: hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q?rev=1590681&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q Mon Apr 28 15:40:31 2014
@@ -0,0 +1,86 @@
+set hive.stats.autogather=false;
+set hive.optimize.union.remove=true;
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set mapred.input.dir.recursive=true;
+
+-- This is to test the union->selectstar->filesink optimization
+-- A union of 2 map-reduce subqueries is performed, followed by a select star and a file sink.
+-- There is no need to write the temporary results of the sub-queries and then read them
+-- again to process the union; the union can be removed completely.
+-- It does not matter whether the output is merged or not. In this case, merging is turned
+-- off.
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+-- Since this test creates sub-directories for the output table outputTbl1, it might be easier
+-- to run the test only on hadoop 23
+
+create table inputTbl1(key string, val string) stored as textfile;
+create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile;
+create table outputTbl2(key string, values bigint) partitioned by (ds string) stored as textfile;
+create table outputTbl3(key string, values bigint) partitioned by (ds string,hr string) stored as textfile;
+
+load data local inpath '../../data/files/T1.txt' into table inputTbl1;
+
+explain
+insert overwrite table outputTbl1 partition(ds='2004')
+SELECT *
+FROM (
+  SELECT key, count(1) as values from inputTbl1 group by key
+  UNION ALL
+  SELECT key, count(1) as values from inputTbl1 group by key
+) a;
+
+insert overwrite table outputTbl1 partition(ds='2004')
+SELECT *
+FROM (
+  SELECT key, count(1) as values from inputTbl1 group by key
+  UNION ALL
+  SELECT key, count(1) as values from inputTbl1 group by key
+) a;
+
+desc formatted outputTbl1 partition(ds='2004');
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+select * from outputTbl1 order by key, values;
+
+explain 
+insert overwrite table outputTbl2 partition(ds)
+SELECT *
+FROM (
+  SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+  UNION ALL
+  SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+) a;
+
+insert overwrite table outputTbl2 partition(ds)
+SELECT *
+FROM (
+  SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+  UNION ALL
+  SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+) a;
+
+show partitions outputTbl2;
+desc formatted outputTbl2 partition(ds='2008-04-08');
+
+explain insert overwrite table outputTbl3 partition(ds, hr)
+SELECT *
+FROM (
+  SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+  UNION ALL
+  SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+) a;
+
+insert overwrite table outputTbl3 partition(ds, hr)
+SELECT *
+FROM (
+  SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+  UNION ALL
+  SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+) a;
+
+show partitions outputTbl3;
+desc formatted outputTbl3 partition(ds='2008-04-08', hr='11');

Modified: hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out (original)
+++ hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out Mon Apr 28 15:40:31 2014
@@ -79,7 +79,7 @@ Partition Parameters:	 	 
 	numFiles            	1                   
 	numRows             	-1                  
 	rawDataSize         	-1                  
-	totalSize           	5293                
+	totalSize           	5301                
 #### A masked pattern was here ####
 	 	 
 # Storage Information	 	 

Modified: hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out (original)
+++ hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out Mon Apr 28 15:40:31 2014
@@ -22,10 +22,10 @@ numFiles	0
 c	3
 #### A masked pattern was here ####
 a	1
-COLUMN_STATS_ACCURATE	false
 #### A masked pattern was here ####
-numRows	-1
+COLUMN_STATS_ACCURATE	false
 totalSize	0
+numRows	-1
 rawDataSize	-1
 FAILED: SemanticException [Error 10215]: Please use the following syntax if not sure whether the property existed or not:
 ALTER TABLE tableName UNSET TBLPROPERTIES IF EXISTS (key1, key2, ...)

Modified: hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out Mon Apr 28 15:40:31 2014
@@ -269,7 +269,7 @@ STAGE PLANS:
       Map Operator Tree:
           TableScan
             alias: s
-            Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+            Statistics: Num rows: 0 Data size: 16 Basic stats: PARTIAL Column stats: NONE
             Sorted Merge Bucket Map Join Operator
               condition map:
                    Inner Join 0 to 1

Modified: hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out Mon Apr 28 15:40:31 2014
@@ -176,9 +176,9 @@ STAGE PLANS:
       Map Operator Tree:
           TableScan
             alias: t2
-            Statistics: Num rows: 0 Data size: 80294704 Basic stats: PARTIAL Column stats: COMPLETE
+            Statistics: Num rows: 0 Data size: 79536648 Basic stats: PARTIAL Column stats: COMPLETE
             Select Operator
-              Statistics: Num rows: 0 Data size: 80294704 Basic stats: PARTIAL Column stats: COMPLETE
+              Statistics: Num rows: 0 Data size: 79536648 Basic stats: PARTIAL Column stats: COMPLETE
               Group By Operator
                 aggregations: count(1)
                 mode: hash