You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by xu...@apache.org on 2014/12/13 18:44:42 UTC
svn commit: r1645338 [1/9] - in /hive/branches/spark: data/conf/spark/
itests/src/test/resources/ ql/src/java/org/apache/hadoop/hive/ql/optimizer/
ql/src/test/results/clientpositive/spark/
Author: xuefu
Date: Sat Dec 13 17:44:41 2014
New Revision: 1645338
URL: http://svn.apache.org/r1645338
Log:
HIVE-8911: Enable mapjoin hints [Spark Branch] (Chao via Xuefu)
Added:
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java
Modified:
hive/branches/spark/data/conf/spark/hive-site.xml
hive/branches/spark/itests/src/test/resources/testconfiguration.properties
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join25.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join26.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join27.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join30.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join36.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join37.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join38.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join39.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join40.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/join_map_ppr.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin1.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin_filter_on_outerjoin.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin_test_outer.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/semijoin.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/skewjoin.q.out
Modified: hive/branches/spark/data/conf/spark/hive-site.xml
URL: http://svn.apache.org/viewvc/hive/branches/spark/data/conf/spark/hive-site.xml?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/data/conf/spark/hive-site.xml (original)
+++ hive/branches/spark/data/conf/spark/hive-site.xml Sat Dec 13 17:44:41 2014
@@ -162,7 +162,7 @@
<property>
<name>hive.ignore.mapjoin.hint</name>
- <value>true</value>
+ <value>false</value>
<description>Whether Hive ignores the mapjoin hint</description>
</property>
Modified: hive/branches/spark/itests/src/test/resources/testconfiguration.properties
URL: http://svn.apache.org/viewvc/hive/branches/spark/itests/src/test/resources/testconfiguration.properties?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/itests/src/test/resources/testconfiguration.properties (original)
+++ hive/branches/spark/itests/src/test/resources/testconfiguration.properties Sat Dec 13 17:44:41 2014
@@ -506,7 +506,6 @@ spark.query.files=add_part_multiple.q, \
auto_sortmerge_join_8.q, \
auto_sortmerge_join_9.q, \
auto_sortmerge_join_10.q, \
- auto_sortmerge_join_11.q, \
auto_sortmerge_join_12.q, \
auto_sortmerge_join_13.q, \
auto_sortmerge_join_14.q, \
@@ -524,7 +523,6 @@ spark.query.files=add_part_multiple.q, \
bucketmapjoin3.q, \
bucketmapjoin4.q, \
bucketmapjoin5.q, \
- bucketmapjoin6.q, \
bucketmapjoin7.q, \
bucketmapjoin8.q, \
bucketmapjoin9.q, \
@@ -671,13 +669,11 @@ spark.query.files=add_part_multiple.q, \
join_cond_pushdown_unqual3.q, \
join_cond_pushdown_unqual4.q, \
join_empty.q \
- join_filters.q, \
join_filters_overlap.q, \
join_hive_626.q, \
join_map_ppr.q, \
join_merge_multi_expressions.q, \
join_merging.q, \
- join_nulls.q, \
join_rc.q, \
join_reorder.q, \
join_reorder2.q, \
@@ -808,21 +804,6 @@ spark.query.files=add_part_multiple.q, \
skewjoin_noskew.q, \
skewjoin_union_remove_1.q, \
skewjoin_union_remove_2.q, \
- smb_mapjoin9.q, \
- smb_mapjoin_1.q, \
- smb_mapjoin_2.q, \
- smb_mapjoin_3.q, \
- smb_mapjoin_4.q, \
- smb_mapjoin_5.q, \
- smb_mapjoin_6.q, \
- smb_mapjoin_7.q, \
- smb_mapjoin_8.q, \
- smb_mapjoin_10.q, \
- smb_mapjoin_13.q, \
- smb_mapjoin_14.q, \
- smb_mapjoin_15.q, \
- smb_mapjoin_16.q, \
- smb_mapjoin_17.q, \
smb_mapjoin_18.q, \
smb_mapjoin_19.q, \
smb_mapjoin_20.q, \
@@ -830,14 +811,6 @@ spark.query.files=add_part_multiple.q, \
smb_mapjoin_22.q, \
smb_mapjoin_25.q, \
sort.q, \
- sort_merge_join_desc_1.q, \
- sort_merge_join_desc_2.q, \
- sort_merge_join_desc_3.q, \
- sort_merge_join_desc_4.q, \
- sort_merge_join_desc_5.q, \
- sort_merge_join_desc_6.q, \
- sort_merge_join_desc_7.q, \
- sort_merge_join_desc_8.q, \
spark_test.q, \
stats_counter.q, \
stats_counter_partitioned.q, \
@@ -951,7 +924,6 @@ spark.query.files=add_part_multiple.q, \
vectorization_part_project.q, \
vectorization_pushdown.q, \
vectorization_short_regress.q, \
- vectorized_bucketmapjoin1.q, \
vectorized_case.q, \
vectorized_mapjoin.q, \
vectorized_math_funcs.q, \
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java Sat Dec 13 17:44:41 2014
@@ -521,7 +521,7 @@ abstract public class AbstractSMBJoinPro
JoinOperator joinOp,
SortBucketJoinProcCtx joinContext,
ParseContext parseContext) throws SemanticException {
- MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin(
+ MapJoinOperator mapJoinOp = new MapJoinProcessor().convertMapJoin(
parseContext.getConf(),
parseContext.getOpParseCtx(),
joinOp,
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java Sat Dec 13 17:44:41 2014
@@ -233,7 +233,7 @@ public class MapJoinProcessor implements
newWork.getMapWork().getOpParseCtxMap();
QBJoinTree newJoinTree = newWork.getMapWork().getJoinTree();
// generate the map join operator; already checked the map join
- MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(conf, opParseCtxMap, op,
+ MapJoinOperator newMapJoinOp = new MapJoinProcessor().convertMapJoin(conf, opParseCtxMap, op,
newJoinTree, mapJoinPos, true, false);
genLocalWorkForMapJoin(newWork, newMapJoinOp, mapJoinPos);
}
@@ -302,8 +302,9 @@ public class MapJoinProcessor implements
* position of the source to be read as part of map-reduce framework. All other sources
* are cached in memory
* @param noCheckOuterJoin
+ * @param validateMapJoinTree
*/
- public static MapJoinOperator convertMapJoin(HiveConf conf,
+ public MapJoinOperator convertMapJoin(HiveConf conf,
LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap,
JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin,
boolean validateMapJoinTree)
@@ -598,7 +599,7 @@ public class MapJoinProcessor implements
return mapJoinPos;
}
- private void genSelectPlan(ParseContext pctx, MapJoinOperator input) throws SemanticException {
+ protected void genSelectPlan(ParseContext pctx, MapJoinOperator input) throws SemanticException {
List<Operator<? extends OperatorDesc>> childOps = input.getChildOperators();
input.setChildOperators(null);
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java Sat Dec 13 17:44:41 2014
@@ -101,7 +101,9 @@ public class Optimizer {
transformations.add(new RewriteGBUsingIndex());
}
transformations.add(new SamplePruner());
- transformations.add(new MapJoinProcessor());
+
+ MapJoinProcessor mapJoinProcessor = isSparkExecEngine ? new SparkMapJoinProcessor() : new MapJoinProcessor();
+ transformations.add(mapJoinProcessor);
if ((HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) && !isTezExecEngine && !isSparkExecEngine) {
transformations.add(new BucketMapJoinOptimizer());
Added: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java?rev=1645338&view=auto
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java (added)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java Sat Dec 13 17:44:41 2014
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.parse.OpParseContext;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+public class SparkMapJoinProcessor extends MapJoinProcessor {
+
+ /**
+ * Convert a regular join to a map-side join.
+ *
+ * @param conf current Hive configuration
+ * @param opParseCtxMap mapping from each operator to its parse context
+ * @param op join operator to be converted
+ * @param joinTree qb join tree
+ * @param bigTablePos position of the source to be read as part of
+ * map-reduce framework. All other sources are cached in memory
+ * @param noCheckOuterJoin if true, skip the check that rejects caching the outer side
+ * @param validateMapJoinTree whether the converted map-join tree should be validated
+ */
+ @Override
+ public MapJoinOperator convertMapJoin(HiveConf conf,
+ LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap,
+ JoinOperator op, QBJoinTree joinTree, int bigTablePos,
+ boolean noCheckOuterJoin,
+ boolean validateMapJoinTree) throws SemanticException {
+
+ // outer join cannot be performed on a table which is being cached (the small-table side)
+ JoinCondDesc[] condns = op.getConf().getConds();
+
+ if (!noCheckOuterJoin) {
+ if (checkMapJoin(bigTablePos, condns) < 0) {
+ throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
+ }
+ }
+
+ // create the map-join operator in place of the join operator
+ MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, opParseCtxMap,
+ op, joinTree, bigTablePos, noCheckOuterJoin);
+
+ // 1. remove the RS (reduce sink) as parent for the big table branch
+ // 2. remove the old join op from the child set of all the RSs
+ List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
+ for (int i = 0; i < parentOps.size(); i++) {
+ Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
+ parentOp.getChildOperators().remove(op);
+ if (i == bigTablePos) {
+ List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
+ Preconditions.checkArgument(grandParentOps.size() == 1,
+ "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
+ Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
+ grandParentOp.replaceChild(parentOp, mapJoinOp);
+ mapJoinOp.replaceParent(parentOp, grandParentOp);
+ }
+ }
+
+ return mapJoinOp;
+ }
+}
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out Sat Dec 13 17:44:41 2014
@@ -104,59 +104,60 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
+ alias: b
+ Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table1
+ base file name: table2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -166,67 +167,93 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table1
- name: default.table1
+ name: default.table2
+ name: default.table2
Truncated Path -> Alias:
- /table1 [a]
- Map 4
+ /table2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
+ alias: a
+ Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table2
+ base file name: table1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -236,47 +263,26 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table2
- name: default.table2
+ name: default.table1
+ name: default.table1
Truncated Path -> Alias:
- /table2 [b]
+ /table1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out Sat Dec 13 17:44:41 2014
@@ -104,59 +104,60 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
+ alias: b
+ Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table1
+ base file name: table2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -166,67 +167,93 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table1
- name: default.table1
+ name: default.table2
+ name: default.table2
Truncated Path -> Alias:
- /table1 [a]
- Map 4
+ /table2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
+ alias: a
+ Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table2
+ base file name: table1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -236,47 +263,26 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table2
- name: default.table2
+ name: default.table1
+ name: default.table1
Truncated Path -> Alias:
- /table2 [b]
+ /table1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out Sat Dec 13 17:44:41 2014
@@ -91,54 +91,26 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Local Work:
+ Map Reduce Local Work
+
Stage: Stage-1
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
- Map 3
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2
- columns.types int:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -227,54 +199,26 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
- Map 3
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2
- columns.types int:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -456,193 +400,196 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: srcbucket_mapjoin
+ base file name: ds=2008-04-08
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ partition_columns.types string
+ rawDataSize 0
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part
+ partition_columns ds
+ partition_columns.types string
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin
- name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
- /srcbucket_mapjoin [a]
- Map 3
+ /srcbucket_mapjoin_part/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-08
+ base file name: srcbucket_mapjoin
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- partition values:
- ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- numFiles 4
- numRows 0
- partition_columns ds
- partition_columns.types string
- rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ COLUMN_STATS_ACCURATE true
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- partition_columns ds
- partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part
- name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin
Truncated Path -> Alias:
- /srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin [a]
Stage: Stage-0
Move Operator
@@ -841,15 +788,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -862,14 +808,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -917,7 +865,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
/srcbucket_mapjoin [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -927,14 +880,57 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 1
+ numRows 464
+ rawDataSize 8519
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 8983
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -987,52 +983,6 @@ STAGE PLANS:
name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
/srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 464
- rawDataSize 8519
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 8983
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -1050,7 +1000,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 1
numRows 464
rawDataSize 8519
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out Sat Dec 13 17:44:41 2014
@@ -192,34 +192,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 1737 Data size: 6950 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -232,22 +233,22 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 2
+ name default.srcbucket_mapjoin_part_2
+ numFiles 3
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -260,16 +261,16 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -279,22 +280,22 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 3
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 4200
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -307,36 +308,62 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_2/part=2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 1737 Data size: 6950 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 955 Data size: 3824 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -349,22 +376,22 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 3
+ name default.srcbucket_mapjoin_part_1
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 4200
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -377,16 +404,16 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -396,22 +423,22 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part_1
+ numFiles 3
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -424,41 +451,20 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
- /srcbucket_mapjoin_part_2/part=2 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 955 Data size: 3824 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator