You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2012/08/17 07:52:11 UTC

svn commit: r1374142 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/ java/org/apache/hadoop/hive/ql/optimizer/ test/queries/clientnegative/ test/queries/clientpositive/ test/results/clientnegative/ test/results/clientpositive/

Author: namit
Date: Fri Aug 17 05:52:11 2012
New Revision: 1374142

URL: http://svn.apache.org/viewvc?rev=1374142&view=rev
Log:
HIVE-3375 bucketed map join should check that the number of files matches
the number of buckets (namit via kevin and carl)


Added:
    hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q
    hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q
    hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out
    hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
    hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin9.q
    hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q
    hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java?rev=1374142&r1=1374141&r2=1374142&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java Fri Aug 17 05:52:11 2012
@@ -246,6 +246,11 @@ public enum ErrorMsg {
   EXPRESSIONS_NOT_ALLOWED_SORTBY(10140,
     "Expressions are not allowed in a sort by clause. Use a column alias instead"),
 
+  BUCKETED_TABLE_METADATA_INCORRECT(10141,
+   "Bucketed table metadata is not correct. " +
+    "Fix the metadata or don't use bucketed mapjoin, by setting " +
+    "hive.enforce.bucketmapjoin to false."),
+
   SCRIPT_INIT_ERROR(20000, "Unable to initialize custom script."),
   SCRIPT_IO_ERROR(20001, "An error occurred while reading or writing to your custom script. "
       + "It may have crashed with an error."),

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java?rev=1374142&r1=1374141&r2=1374142&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java Fri Aug 17 05:52:11 2012
@@ -234,12 +234,21 @@ public class BucketMapJoinOptimizer impl
                 return false;
               }
               List<String> fileNames = getOnePartitionBucketFileNames(p.getDataLocation());
+              // The number of files in the partition should be the same as the number of buckets.
+              int bucketCount = p.getBucketCount();
+              if (fileNames.size() != bucketCount) {
+                String msg = "The number of buckets for table " +
+                  tbl.getTableName() + " partition " + p.getName() + " is " +
+                  p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
+                throw new SemanticException(
+                  ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
+              }
               if (alias.equals(baseBigAlias)) {
                 bigTblPartsToBucketFileNames.put(p, fileNames);
-                bigTblPartsToBucketNumber.put(p, p.getBucketCount());
+                bigTblPartsToBucketNumber.put(p, bucketCount);
               } else {
                 files.add(fileNames);
-                buckets.add(p.getBucketCount());
+                buckets.add(bucketCount);
               }
             }
             if (!alias.equals(baseBigAlias)) {
@@ -253,6 +262,14 @@ public class BucketMapJoinOptimizer impl
           }
           List<String> fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation());
           Integer num = new Integer(tbl.getNumBuckets());
+          // The number of files for the table should be the same as the number of buckets.
+          if (fileNames.size() != num) {
+            String msg = "The number of buckets for table " +
+              tbl.getTableName() + " is " + tbl.getNumBuckets() +
+              ", whereas the number of files is " + fileNames.size();
+            throw new SemanticException(
+              ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
+          }
           if (alias.equals(baseBigAlias)) {
             bigTblPartsToBucketFileNames.put(null, fileNames);
             bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());

Added: hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q?rev=1374142&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q Fri Aug 17 05:52:11 2012
@@ -0,0 +1,20 @@
+-- Although the user has specified a bucketed map-join, the number of buckets in the table
+-- do not match the number of files
+drop table table1;
+drop table table2;
+
+create table table1(key string, value string) clustered by (key, value)
+into 2 BUCKETS stored as textfile;
+create table table2(key string, value string) clustered by (value, key)
+into 2 BUCKETS stored as textfile;
+
+load data local inpath '../data/files/T1.txt' overwrite into table table1;
+
+load data local inpath '../data/files/T1.txt' overwrite into table table2;
+load data local inpath '../data/files/T2.txt' overwrite into table table2;
+
+set hive.optimize.bucketmapjoin = true;
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+select /*+ mapjoin(b) */ count(*) from table1 a join table2 b on a.key=b.key and a.value=b.value;
+

Added: hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q?rev=1374142&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q Fri Aug 17 05:52:11 2012
@@ -0,0 +1,24 @@
+-- Although the user has specified a bucketed map-join, the number of buckets in the table
+-- do not match the number of files
+drop table table1;
+drop table table2;
+
+create table table1(key string, value string) partitioned by (ds string) clustered by (key, value)
+into 2 BUCKETS stored as textfile;
+create table table2(key string, value string) clustered by (value, key)
+into 2 BUCKETS stored as textfile;
+
+load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='1');
+load data local inpath '../data/files/T2.txt' overwrite into table table1 partition (ds='1');
+
+load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='2');
+
+load data local inpath '../data/files/T1.txt' overwrite into table table2;
+load data local inpath '../data/files/T2.txt' overwrite into table table2;
+
+set hive.optimize.bucketmapjoin = true;
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+select /*+ mapjoin(b) */ count(*) from table1 a join table2 b
+on a.key=b.key and a.value=b.value and a.ds is not null;
+

Modified: hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin9.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin9.q?rev=1374142&r1=1374141&r2=1374142&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin9.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin9.q Fri Aug 17 05:52:11 2012
@@ -30,10 +30,14 @@ ON a.key = b.key WHERE a.ds = '2010-10-1
 
 set hive.enforce.bucketing = true;
 set hive.enforce.sorting = true;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.exec.reducers.max = 1;
 
 insert overwrite table hive_test_smb_bucket1 partition (ds='2010-10-15') select key, value from src;
 insert overwrite table hive_test_smb_bucket2 partition (ds='2010-10-15') select key, value from src;
 
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
 explain
 create table smb_mapjoin9_results as
 SELECT /* + MAPJOIN(b) */ b.key as k1, b.value, b.ds, a.key as k2

Modified: hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q?rev=1374142&r1=1374141&r2=1374142&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q Fri Aug 17 05:52:11 2012
@@ -4,10 +4,18 @@ create table tmp_smb_bucket_10(userid in
 alter table tmp_smb_bucket_10 add partition (ds = '1');
 alter table tmp_smb_bucket_10 add partition (ds = '2');
 
+-- add dummy files to make sure that the number of files in each partition is same as number of buckets
+ 
+load data local inpath '../data/files/smbbucket_1.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='1');
+load data local inpath '../data/files/smbbucket_2.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='1');
+
+load data local inpath '../data/files/smbbucket_1.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='2');
+load data local inpath '../data/files/smbbucket_2.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='2');
+
 set hive.optimize.bucketmapjoin = true;
 set hive.optimize.bucketmapjoin.sortedmerge = true;
 set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
- 
+
 explain
 select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b 
 on (a.ds = '1' and b.ds = '2' and

Added: hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out?rev=1374142&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_1.q.out Fri Aug 17 05:52:11 2012
@@ -0,0 +1,45 @@
+PREHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table
+-- do not match the number of files
+drop table table1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table
+-- do not match the number of files
+drop table table1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table table2
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table table2
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table table1(key string, value string) clustered by (key, value)
+into 2 BUCKETS stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table table1(key string, value string) clustered by (key, value)
+into 2 BUCKETS stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@table1
+PREHOOK: query: create table table2(key string, value string) clustered by (value, key)
+into 2 BUCKETS stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table table2(key string, value string) clustered by (value, key)
+into 2 BUCKETS stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@table2
+PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table1
+POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table1
+PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table2
+POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table2
+PREHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table2
+POSTHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table2
+FAILED: SemanticException [Error 10141]: Bucketed table metadata is not correct. Fix the metadata or don't use bucketed mapjoin, by setting hive.enforce.bucketmapjoin to false. The number of buckets for table table1 is 2, whereas the number of files is 1

Added: hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out?rev=1374142&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/bucket_mapjoin_wrong_table_metadata_2.q.out Fri Aug 17 05:52:11 2012
@@ -0,0 +1,59 @@
+PREHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table
+-- do not match the number of files
+drop table table1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: -- Although the user has specified a bucketed map-join, the number of buckets in the table
+-- do not match the number of files
+drop table table1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table table2
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table table2
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table table1(key string, value string) partitioned by (ds string) clustered by (key, value)
+into 2 BUCKETS stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table table1(key string, value string) partitioned by (ds string) clustered by (key, value)
+into 2 BUCKETS stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@table1
+PREHOOK: query: create table table2(key string, value string) clustered by (value, key)
+into 2 BUCKETS stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table table2(key string, value string) clustered by (value, key)
+into 2 BUCKETS stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@table2
+PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='1')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table1
+POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='1')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table1
+POSTHOOK: Output: default@table1@ds=1
+PREHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table1 partition (ds='1')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table1@ds=1
+POSTHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table1 partition (ds='1')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table1@ds=1
+PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='2')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table1
+POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table1 partition (ds='2')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table1
+POSTHOOK: Output: default@table1@ds=2
+PREHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table2
+POSTHOOK: query: load data local inpath '../data/files/T1.txt' overwrite into table table2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table2
+PREHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@table2
+POSTHOOK: query: load data local inpath '../data/files/T2.txt' overwrite into table table2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@table2
+FAILED: SemanticException [Error 10141]: Bucketed table metadata is not correct. Fix the metadata or don't use bucketed mapjoin, by setting hive.enforce.bucketmapjoin to false. The number of buckets for table table1 partition ds=1 is 2, whereas the number of files is 1

Modified: hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out?rev=1374142&r1=1374141&r2=1374142&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out Fri Aug 17 05:52:11 2012
@@ -17,6 +17,34 @@ POSTHOOK: query: alter table tmp_smb_buc
 POSTHOOK: type: ALTERTABLE_ADDPARTS
 POSTHOOK: Input: default@tmp_smb_bucket_10
 POSTHOOK: Output: default@tmp_smb_bucket_10@ds=2
+PREHOOK: query: -- add dummy files to make sure that the number of files in each partition is same as number of buckets
+ 
+load data local inpath '../data/files/smbbucket_1.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='1')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmp_smb_bucket_10@ds=1
+POSTHOOK: query: -- add dummy files to make sure that the number of files in each partition is same as number of buckets
+ 
+load data local inpath '../data/files/smbbucket_1.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='1')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=1
+PREHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='1')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmp_smb_bucket_10@ds=1
+POSTHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='1')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=1
+PREHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='2')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmp_smb_bucket_10@ds=2
+POSTHOOK: query: load data local inpath '../data/files/smbbucket_1.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='2')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=2
+PREHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='2')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmp_smb_bucket_10@ds=2
+POSTHOOK: query: load data local inpath '../data/files/smbbucket_2.rc' INTO TABLE tmp_smb_bucket_10 partition(ds='2')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=2
 PREHOOK: query: explain
 select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b 
 on (a.ds = '1' and b.ds = '2' and