You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2014/04/28 17:40:32 UTC
svn commit: r1590681 [1/3] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/common/
metastore/src/java/org/apache/hadoop/hive/metastore/
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/test/queries/clientpositive/ ql/src/test/results/clientn...
Author: hashutosh
Date: Mon Apr 28 15:40:31 2014
New Revision: 1590681
URL: http://svn.apache.org/r1590681
Log:
HIVE-6979 : Hadoop-2 test failures related to quick stats not being populated correctly (Prasanth J via Ashutosh Chauhan)
Added:
hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q
hive/trunk/ql/src/test/results/clientpositive/union_remove_25.q.out
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out
hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out
hive/trunk/ql/src/test/results/clientpositive/filter_numeric.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_1.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out
hive/trunk/ql/src/test/results/clientpositive/infer_bucket_sort_list_bucket.q.out
hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_6.q.out
hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_7.q.out
hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_8.q.out
hive/trunk/ql/src/test/results/clientpositive/mapjoin_test_outer.q.out
hive/trunk/ql/src/test/results/clientpositive/nullformatCTAS.q.out
hive/trunk/ql/src/test/results/clientpositive/nullgroup3.q.out
hive/trunk/ql/src/test/results/clientpositive/orc_createas1.q.out
hive/trunk/ql/src/test/results/clientpositive/ppd_join4.q.out
hive/trunk/ql/src/test/results/clientpositive/select_dummy_source.q.out
hive/trunk/ql/src/test/results/clientpositive/show_create_table_alter.q.out
hive/trunk/ql/src/test/results/clientpositive/show_tblproperties.q.out
hive/trunk/ql/src/test/results/clientpositive/stats_list_bucket.q.out
hive/trunk/ql/src/test/results/clientpositive/stats_partscan_1_23.q.out
hive/trunk/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
hive/trunk/ql/src/test/results/clientpositive/truncate_column_list_bucket.q.out
hive/trunk/ql/src/test/results/clientpositive/udf_current_database.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_1.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_10.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_12.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_13.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_14.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_19.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_2.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_20.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_21.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_22.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_23.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_24.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_4.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_5.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_7.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_8.q.out
hive/trunk/ql/src/test/results/clientpositive/union_remove_9.q.out
hive/trunk/ql/src/test/results/clientpositive/unset_table_view_property.q.out
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java Mon Apr 28 15:40:31 2014
@@ -18,6 +18,8 @@
package org.apache.hadoop.hive.common;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -46,6 +48,22 @@ public class HiveStatsUtils {
public static FileStatus[] getFileStatusRecurse(Path path, int level, FileSystem fs)
throws IOException {
+ // if level is < 0, then return all files/directories under the specified path
+ if ( level < 0) {
+ List<FileStatus> result = new ArrayList<FileStatus>();
+ try {
+ FileStatus fileStatus = fs.getFileStatus(path);
+ FileUtils.listStatusRecursively(fs, fileStatus, result);
+ } catch (IOException e) {
+ // The globStatus() API returns an empty FileStatus[] when the specified path
+ // does not exist, but getFileStatus() throws an IOException. To mimic that
+ // behavior we return an empty array on exception. For external
+ // tables, the path of the table will not exist during table creation.
+ return new FileStatus[0];
+ }
+ return result.toArray(new FileStatus[result.size()]);
+ }
+
// construct a path pattern (e.g., /*/*) to find all dynamically generated paths
StringBuilder sb = new StringBuilder(path.toUri().getPath());
for (int i = 0; i < level; i++) {
Modified: hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java (original)
+++ hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java Mon Apr 28 15:40:31 2014
@@ -189,13 +189,8 @@ public class MetaStoreUtils {
// Let's try to populate those stats that don't require full scan.
LOG.info("Updating table stats fast for " + tbl.getTableName());
FileStatus[] fileStatus = wh.getFileStatusesForUnpartitionedTable(db, tbl);
- params.put(StatsSetupConst.NUM_FILES, Integer.toString(fileStatus.length));
- long tableSize = 0L;
- for (FileStatus status : fileStatus) {
- tableSize += status.getLen();
- }
- params.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tableSize));
- LOG.info("Updated size of table " + tbl.getTableName() +" to "+ Long.toString(tableSize));
+ populateQuickStats(fileStatus, params);
+ LOG.info("Updated size of table " + tbl.getTableName() +" to "+ params.get(StatsSetupConst.TOTAL_SIZE));
if(!params.containsKey(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK)) {
// invalidate stats requiring scan since this is a regular ddl alter case
for (String stat : StatsSetupConst.statsRequireCompute) {
@@ -213,6 +208,20 @@ public class MetaStoreUtils {
return updated;
}
+ public static void populateQuickStats(FileStatus[] fileStatus, Map<String, String> params) {
+ int numFiles = 0;
+ long tableSize = 0L;
+ for (FileStatus status : fileStatus) {
+ // don't take directories into account for quick stats
+ if (!status.isDir()) {
+ tableSize += status.getLen();
+ numFiles += 1;
+ }
+ }
+ params.put(StatsSetupConst.NUM_FILES, Integer.toString(numFiles));
+ params.put(StatsSetupConst.TOTAL_SIZE, Long.toString(tableSize));
+ }
+
// check if stats need to be (re)calculated
public static boolean requireCalStats(Configuration hiveConf, Partition oldPart,
Partition newPart, Table tbl) {
@@ -285,13 +294,8 @@ public class MetaStoreUtils {
// populate those statistics that don't require a full scan of the data.
LOG.warn("Updating partition stats fast for: " + part.getTableName());
FileStatus[] fileStatus = wh.getFileStatusesForSD(part.getSd());
- params.put(StatsSetupConst.NUM_FILES, Integer.toString(fileStatus.length));
- long partSize = 0L;
- for (int i = 0; i < fileStatus.length; i++) {
- partSize += fileStatus[i].getLen();
- }
- params.put(StatsSetupConst.TOTAL_SIZE, Long.toString(partSize));
- LOG.warn("Updated size to " + Long.toString(partSize));
+ populateQuickStats(fileStatus, params);
+ LOG.warn("Updated size to " + params.get(StatsSetupConst.TOTAL_SIZE));
if(!params.containsKey(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK)) {
// invalidate stats requiring scan since this is a regular ddl alter case
for (String stat : StatsSetupConst.statsRequireCompute) {
Modified: hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java
URL: http://svn.apache.org/viewvc/hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java (original)
+++ hive/trunk/metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java Mon Apr 28 15:40:31 2014
@@ -496,9 +496,7 @@ public class Warehouse {
try {
Path path = new Path(desc.getLocation());
FileSystem fileSys = path.getFileSystem(conf);
- /* consider sub-directory created from list bucketing. */
- int listBucketingDepth = calculateListBucketingDMLDepth(desc);
- return HiveStatsUtils.getFileStatusRecurse(path, (1 + listBucketingDepth), fileSys);
+ return HiveStatsUtils.getFileStatusRecurse(path, -1, fileSys);
} catch (IOException ioe) {
MetaStoreUtils.logAndThrowMetaException(ioe);
}
@@ -506,28 +504,6 @@ public class Warehouse {
}
/**
- * List bucketing will introduce sub-directories.
- * calculate it here in order to go to the leaf directory
- * so that we can count right number of files.
- * @param desc
- * @return
- */
- private static int calculateListBucketingDMLDepth(StorageDescriptor desc) {
- // list bucketing will introduce more files
- int listBucketingDepth = 0;
- SkewedInfo skewedInfo = desc.getSkewedInfo();
- if ((skewedInfo != null) && (skewedInfo.getSkewedColNames() != null)
- && (skewedInfo.getSkewedColNames().size() > 0)
- && (skewedInfo.getSkewedColValues() != null)
- && (skewedInfo.getSkewedColValues().size() > 0)
- && (skewedInfo.getSkewedColValueLocationMaps() != null)
- && (skewedInfo.getSkewedColValueLocationMaps().size() > 0)) {
- listBucketingDepth = skewedInfo.getSkewedColNames().size();
- }
- return listBucketingDepth;
- }
-
- /**
* @param table
* @return array of FileStatus objects corresponding to the files making up the passed
* unpartitioned table
@@ -537,7 +513,7 @@ public class Warehouse {
Path tablePath = getTablePath(db, table.getTableName());
try {
FileSystem fileSys = tablePath.getFileSystem(conf);
- return HiveStatsUtils.getFileStatusRecurse(tablePath, 1, fileSys);
+ return HiveStatsUtils.getFileStatusRecurse(tablePath, -1, fileSys);
} catch (IOException ioe) {
MetaStoreUtils.logAndThrowMetaException(ioe);
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java Mon Apr 28 15:40:31 2014
@@ -30,6 +30,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
@@ -326,12 +327,7 @@ public class StatsTask extends Task<Stat
* calculate fast statistics
*/
FileStatus[] partfileStatus = wh.getFileStatusesForSD(desc);
- parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(partfileStatus.length));
- long partSize = 0L;
- for (int i = 0; i < partfileStatus.length; i++) {
- partSize += partfileStatus[i].getLen();
- }
- parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(partSize));
+ MetaStoreUtils.populateQuickStats(partfileStatus, parameters);
}
private void clearStats(Map<String, String> parameters) {
Added: hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q?rev=1590681&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/union_remove_25.q Mon Apr 28 15:40:31 2014
@@ -0,0 +1,86 @@
+set hive.stats.autogather=false;
+set hive.optimize.union.remove=true;
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set mapred.input.dir.recursive=true;
+
+-- This is to test the union->selectstar->filesink optimization
+-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink
+-- There is no need to write the temporary results of the sub-queries, and then read them
+-- again to process the union. The union can be removed completely.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- off
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+-- Since this test creates sub-directories for the output table outputTbl1, it might be easier
+-- to run the test only on hadoop 23
+
+create table inputTbl1(key string, val string) stored as textfile;
+create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile;
+create table outputTbl2(key string, values bigint) partitioned by (ds string) stored as textfile;
+create table outputTbl3(key string, values bigint) partitioned by (ds string,hr string) stored as textfile;
+
+load data local inpath '../../data/files/T1.txt' into table inputTbl1;
+
+explain
+insert overwrite table outputTbl1 partition(ds='2004')
+SELECT *
+FROM (
+ SELECT key, count(1) as values from inputTbl1 group by key
+ UNION ALL
+ SELECT key, count(1) as values from inputTbl1 group by key
+) a;
+
+insert overwrite table outputTbl1 partition(ds='2004')
+SELECT *
+FROM (
+ SELECT key, count(1) as values from inputTbl1 group by key
+ UNION ALL
+ SELECT key, count(1) as values from inputTbl1 group by key
+) a;
+
+desc formatted outputTbl1 partition(ds='2004');
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+select * from outputTbl1 order by key, values;
+
+explain
+insert overwrite table outputTbl2 partition(ds)
+SELECT *
+FROM (
+ SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+ UNION ALL
+ SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+) a;
+
+insert overwrite table outputTbl2 partition(ds)
+SELECT *
+FROM (
+ SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+ UNION ALL
+ SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500
+) a;
+
+show partitions outputTbl2;
+desc formatted outputTbl2 partition(ds='2008-04-08');
+
+explain insert overwrite table outputTbl3 partition(ds, hr)
+SELECT *
+FROM (
+ SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+ UNION ALL
+ SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+) a;
+
+insert overwrite table outputTbl3 partition(ds, hr)
+SELECT *
+FROM (
+ SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+ UNION ALL
+ SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000
+) a;
+
+show partitions outputTbl3;
+desc formatted outputTbl3 partition(ds='2008-04-08', hr='11');
Modified: hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out (original)
+++ hive/trunk/ql/src/test/results/clientnegative/stats_partialscan_autogether.q.out Mon Apr 28 15:40:31 2014
@@ -79,7 +79,7 @@ Partition Parameters:
numFiles 1
numRows -1
rawDataSize -1
- totalSize 5293
+ totalSize 5301
#### A masked pattern was here ####
# Storage Information
Modified: hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out (original)
+++ hive/trunk/ql/src/test/results/clientnegative/unset_table_property.q.out Mon Apr 28 15:40:31 2014
@@ -22,10 +22,10 @@ numFiles 0
c 3
#### A masked pattern was here ####
a 1
-COLUMN_STATS_ACCURATE false
#### A masked pattern was here ####
-numRows -1
+COLUMN_STATS_ACCURATE false
totalSize 0
+numRows -1
rawDataSize -1
FAILED: SemanticException [Error 10215]: Please use the following syntax if not sure whether the property existed or not:
ALTER TABLE tableName UNSET TBLPROPERTIES IF EXISTS (key1, key2, ...)
Modified: hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/auto_join32.q.out Mon Apr 28 15:40:31 2014
@@ -269,7 +269,7 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: s
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+ Statistics: Num rows: 0 Data size: 16 Basic stats: PARTIAL Column stats: NONE
Sorted Merge Bucket Map Join Operator
condition map:
Inner Join 0 to 1
Modified: hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out?rev=1590681&r1=1590680&r2=1590681&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucketizedhiveinputformat.q.out Mon Apr 28 15:40:31 2014
@@ -176,9 +176,9 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: t2
- Statistics: Num rows: 0 Data size: 80294704 Basic stats: PARTIAL Column stats: COMPLETE
+ Statistics: Num rows: 0 Data size: 79536648 Basic stats: PARTIAL Column stats: COMPLETE
Select Operator
- Statistics: Num rows: 0 Data size: 80294704 Basic stats: PARTIAL Column stats: COMPLETE
+ Statistics: Num rows: 0 Data size: 79536648 Basic stats: PARTIAL Column stats: COMPLETE
Group By Operator
aggregations: count(1)
mode: hash