Posted to commits@hive.apache.org by xu...@apache.org on 2015/11/18 22:45:52 UTC
[04/23] hive git commit: HIVE-11180: Enable native vectorized map join for spark [Spark Branch] (Rui reviewed by Xuefu)
HIVE-11180: Enable native vectorized map join for spark [Spark Branch] (Rui reviewed by Xuefu)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/80f548af
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/80f548af
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/80f548af
Branch: refs/heads/master
Commit: 80f548af3b762abc7775fdfeb21b0d2d9d417c09
Parents: 714b3db
Author: Rui Li <ru...@intel.com>
Authored: Thu Aug 6 13:58:50 2015 +0800
Committer: Rui Li <ru...@intel.com>
Committed: Thu Aug 6 14:09:36 2015 +0800
----------------------------------------------------------------------
.../org/apache/hadoop/hive/conf/HiveConf.java | 4 +-
.../test/resources/testconfiguration.properties | 9 +-
.../persistence/MapJoinTableContainerSerDe.java | 70 +
.../hive/ql/exec/spark/HashTableLoader.java | 18 +-
.../mapjoin/VectorMapJoinCommonOperator.java | 4 +-
.../fast/VectorMapJoinFastTableContainer.java | 2 +-
.../hive/ql/optimizer/physical/Vectorizer.java | 6 +-
.../optimizer/spark/SparkMapJoinOptimizer.java | 10 +
.../spark/vector_inner_join.q.out | 853 +++++++++++
.../spark/vector_outer_join0.q.out | 242 +++
.../spark/vector_outer_join1.q.out | 631 ++++++++
.../spark/vector_outer_join2.q.out | 327 ++++
.../spark/vector_outer_join3.q.out | 630 ++++++++
.../spark/vector_outer_join4.q.out | 1000 +++++++++++++
.../spark/vector_outer_join5.q.out | 1406 ++++++++++++++++++
15 files changed, 5201 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index f593d7d..73610dc 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -765,8 +765,8 @@ public class HiveConf extends Configuration {
HIVEMAPJOINBUCKETCACHESIZE("hive.mapjoin.bucket.cache.size", 100, ""),
HIVEMAPJOINUSEOPTIMIZEDTABLE("hive.mapjoin.optimized.hashtable", true,
- "Whether Hive should use memory-optimized hash table for MapJoin. Only works on Tez,\n" +
- "because memory-optimized hashtable cannot be serialized."),
+ "Whether Hive should use memory-optimized hash table for MapJoin.\n" +
+ "Only works on Tez and Spark, because memory-optimized hashtable cannot be serialized."),
HIVEUSEHYBRIDGRACEHASHJOIN("hive.mapjoin.hybridgrace.hashtable", true, "Whether to use hybrid" +
"grace hash join as the join method for mapjoin. Tez only."),
HIVEHYBRIDGRACEHASHJOINMEMCHECKFREQ("hive.mapjoin.hybridgrace.memcheckfrequency", 1024, "For " +
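The widened description matches the loader changes further down in this patch, which consult the same flag. As a minimal sketch (not part of this commit; class and variable names as they appear in HiveConf), the setting can be read programmatically like this:

    import org.apache.hadoop.hive.conf.HiveConf;

    public class OptimizedHashTableCheck {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // hive.mapjoin.optimized.hashtable now applies to Spark as well as Tez.
        boolean useOptimized = HiveConf.getBoolVar(
            conf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
        System.out.println("optimized MapJoin hashtable enabled: " + useOptimized);
      }
    }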
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index c710b0b..b04c5d5 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -1181,7 +1181,14 @@ miniSparkOnYarn.query.files=auto_sortmerge_join_16.q,\
stats_counter_partitioned.q,\
temp_table_external.q,\
truncate_column_buckets.q,\
- uber_reduce.q
+ uber_reduce.q,\
+ vector_inner_join.q,\
+ vector_outer_join0.q,\
+ vector_outer_join1.q,\
+ vector_outer_join2.q,\
+ vector_outer_join3.q,\
+ vector_outer_join4.q,\
+ vector_outer_join5.q
spark.query.negative.files=groupby2_map_skew_multi_distinct.q,\
groupby2_multi_distinct.q,\
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java
index e97a9f0..d6deabe 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java
@@ -32,7 +32,9 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastTableContainer;
import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.shims.ShimLoader;
@@ -195,6 +197,74 @@ public class MapJoinTableContainerSerDe {
}
}
+ /**
+ * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path.
+ * @param mapJoinDesc The descriptor for the map join
+ * @param fs FileSystem of the folder.
+ * @param folder The folder to load table container.
+ * @param hconf The hive configuration
+ * @return Loaded table.
+ */
+ @SuppressWarnings("unchecked")
+ public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc,
+ FileSystem fs, Path folder, Configuration hconf) throws HiveException {
+ try {
+ if (!fs.isDirectory(folder)) {
+ throw new HiveException("Error, not a directory: " + folder);
+ }
+ FileStatus[] fileStatuses = fs.listStatus(folder);
+ if (fileStatuses == null || fileStatuses.length == 0) {
+ return null;
+ }
+
+ SerDe keySerDe = keyContext.getSerDe();
+ SerDe valueSerDe = valueContext.getSerDe();
+ Writable key = keySerDe.getSerializedClass().newInstance();
+ Writable value = valueSerDe.getSerializedClass().newInstance();
+
+ VectorMapJoinFastTableContainer tableContainer =
+ new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1);
+
+ for (FileStatus fileStatus : fileStatuses) {
+ Path filePath = fileStatus.getPath();
+ if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
+ throw new HiveException("Error, not a file: " + filePath);
+ }
+ InputStream is = null;
+ ObjectInputStream in = null;
+ try {
+ is = fs.open(filePath, 4096);
+ in = new ObjectInputStream(is);
+ // skip the name and metadata
+ in.readUTF();
+ in.readObject();
+ int numKeys = in.readInt();
+ for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
+ key.readFields(in);
+ long numRows = in.readLong();
+ for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) {
+ value.readFields(in);
+ tableContainer.putRow(null, key, null, value);
+ }
+ }
+ } finally {
+ if (in != null) {
+ in.close();
+ } else if (is != null) {
+ is.close();
+ }
+ }
+ }
+
+ tableContainer.seal();
+ return tableContainer;
+ } catch (IOException e) {
+ throw new HiveException("IO error while trying to create table container", e);
+ } catch (Exception e) {
+ throw new HiveException("Error while trying to create table container", e);
+ }
+ }
+
public void persist(ObjectOutputStream out, MapJoinPersistableTableContainer tableContainer)
throws HiveException {
int numKeys = tableContainer.size();
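The new loadFastContainer reads back the layout persist() writes (container name and metadata, then a key count, and per-key row counts and rows) and feeds each row into a VectorMapJoinFastTableContainer before sealing it. A hedged sketch of a caller (the real call site is the Spark HashTableLoader change below; this wrapper class is hypothetical):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
    import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.plan.MapJoinDesc;

    public class FastContainerLoadSketch {
      // Load every small-table file under 'folder' into a fast table container.
      static MapJoinTableContainer load(MapJoinTableContainerSerDe serde, MapJoinDesc desc,
          Configuration hconf, Path folder) throws HiveException, java.io.IOException {
        FileSystem fs = FileSystem.get(folder.toUri(), hconf);
        return serde.loadFastContainer(desc, fs, folder, hconf);
      }
    }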
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
index 10e3497..c2462a0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
@@ -46,6 +46,7 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext;
+import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.mapred.JobConf;
@@ -62,6 +63,8 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
private MapJoinOperator joinOp;
private MapJoinDesc desc;
+ private boolean useFastContainer = false;
+
@Override
public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf,
MapJoinOperator joinOp) {
@@ -69,6 +72,12 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
this.hconf = hconf;
this.joinOp = joinOp;
this.desc = joinOp.getConf();
+ if (desc.getVectorMode() && HiveConf.getBoolVar(
+ hconf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) {
+ VectorMapJoinDesc vectorDesc = desc.getVectorDesc();
+ useFastContainer = vectorDesc != null && vectorDesc.hashTableImplementationType() ==
+ VectorMapJoinDesc.HashTableImplementationType.FAST;
+ }
}
@Override
@@ -98,7 +107,7 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
boolean firstContainer = true;
- boolean useOptimizedContainer = HiveConf.getBoolVar(
+ boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(
hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
for (int pos = 0; pos < mapJoinTables.length; pos++) {
if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
@@ -146,14 +155,17 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
MapJoinTableContainerSerDe mapJoinTableSerde) throws HiveException {
LOG.info("\tLoad back all hashtable files from tmp folder uri:" + path);
if (!SparkUtilities.isDedicatedCluster(hconf)) {
- return mapJoinTableSerde.load(fs, path, hconf);
+ return useFastContainer ? mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) :
+ mapJoinTableSerde.load(fs, path, hconf);
}
MapJoinTableContainer mapJoinTable = SmallTableCache.get(path);
if (mapJoinTable == null) {
synchronized (path.toString().intern()) {
mapJoinTable = SmallTableCache.get(path);
if (mapJoinTable == null) {
- mapJoinTable = mapJoinTableSerde.load(fs, path, hconf);
+ mapJoinTable = useFastContainer ?
+ mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) :
+ mapJoinTableSerde.load(fs, path, hconf);
SmallTableCache.cache(path, mapJoinTable);
}
}
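The init() change only turns on useFastContainer when the operator runs in vector mode with the native fast hash table enabled, so the existing optimized and row containers keep working otherwise. A minimal configuration sketch, not from this commit, of the settings that would exercise the new Spark path (ConfVars names taken from the patch; the values are illustrative):

    import org.apache.hadoop.hive.conf.HiveConf;

    public class NativeFastMapJoinConfSketch {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        conf.setVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "spark");
        // Vectorization plus the native map-join flags gate the FAST hash table path.
        conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
        conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED, true);
        conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED, true);
      }
    }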
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
index 87ebcf2..efad421 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
@@ -541,7 +541,9 @@ public abstract class VectorMapJoinCommonOperator extends MapJoinOperator implem
break;
case FAST:
// Use our specialized hash table loader.
- hashTableLoader = new VectorMapJoinFastHashTableLoader();
+ hashTableLoader = HiveConf.getVar(
+ hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark") ?
+ HashTableLoaderFactory.getLoader(hconf) : new VectorMapJoinFastHashTableLoader();
break;
default:
throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name());
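With the engine check above, the FAST case on Spark now resolves its loader through HashTableLoaderFactory instead of the Tez-oriented VectorMapJoinFastHashTableLoader. A hedged sketch, not part of this commit, of that selection:

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.HashTableLoaderFactory;
    import org.apache.hadoop.hive.ql.exec.HashTableLoader;

    public class LoaderSelectionSketch {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        conf.setVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "spark");
        // With the engine set to spark, the factory hands back the Spark HashTableLoader,
        // which calls loadFastContainer when useFastContainer is set (see the change above).
        HashTableLoader loader = HashTableLoaderFactory.getLoader(conf);
        System.out.println(loader.getClass().getName());
      }
    }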
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
index f2080f4..cf6c0e3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
@@ -195,7 +195,7 @@ public class VectorMapJoinFastTableContainer implements VectorMapJoinTableContai
@Override
public void clear() {
- throw new RuntimeException("Not applicable");
+ // Do nothing
}
@Override
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 82c3e50..4f66cd6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -157,6 +157,7 @@ public class Vectorizer implements PhysicalPlanResolver {
private PhysicalContext physicalContext = null;
private HiveConf hiveConf;
+ private boolean isSpark;
public Vectorizer() {
@@ -873,6 +874,7 @@ public class Vectorizer implements PhysicalPlanResolver {
LOG.info("Vectorization is disabled");
return physicalContext;
}
+ isSpark = (HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark"));
// create dispatcher and graph walker
Dispatcher disp = new VectorizationDispatcher(physicalContext);
TaskGraphWalker ogw = new TaskGraphWalker(disp);
@@ -1444,8 +1446,6 @@ public class Vectorizer implements PhysicalPlanResolver {
Operator<? extends OperatorDesc> vectorOp = null;
Class<? extends Operator<?>> opClass = null;
- boolean isOuterJoin = !desc.getNoOuterJoin();
-
VectorMapJoinDesc.HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE;
VectorMapJoinDesc.HashTableKind hashTableKind = HashTableKind.NONE;
VectorMapJoinDesc.HashTableKeyType hashTableKeyType = HashTableKeyType.NONE;
@@ -1666,7 +1666,7 @@ public class Vectorizer implements PhysicalPlanResolver {
case MAPJOIN:
{
MapJoinDesc desc = (MapJoinDesc) op.getConf();
- boolean specialize = canSpecializeMapJoin(op, desc, isTez);
+ boolean specialize = canSpecializeMapJoin(op, desc, isTez || isSpark);
if (!specialize) {
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
index 39d1f18..46eab65 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
@@ -46,6 +46,8 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.OpTraits;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
/**
* SparkMapJoinOptimizer cloned from ConvertJoinMapJoin is an optimization that replaces a common join
@@ -89,6 +91,14 @@ public class SparkMapJoinOptimizer implements NodeProcessor {
LOG.info("Convert to non-bucketed map join");
MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversionPos);
+ // For native vectorized map join, we require the key SerDe to be BinarySortableSerDe
+ // Note: the MJ may not really get natively-vectorized later,
+ // but changing SerDe won't hurt correctness
+ if (conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED) &&
+ conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
+ mapJoinOp.getConf().getKeyTblDesc().getProperties().setProperty(
+ serdeConstants.SERIALIZATION_LIB, BinarySortableSerDe.class.getName());
+ }
if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
LOG.info("Check if it can be converted to bucketed map join");
numBuckets = convertJoinBucketMapJoin(joinOp, mapJoinOp,
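The optimizer now rewrites the key table descriptor's serialization library so that, if the map join does get natively vectorized, its keys can be deserialized by the fast hash table code; as the comment notes, the change is harmless when native vectorization does not kick in. A standalone sketch of the same property rewrite (the Properties object here stands in for getKeyTblDesc().getProperties(); constant and class names are the ones used in the patch):

    import java.util.Properties;
    import org.apache.hadoop.hive.serde.serdeConstants;
    import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;

    public class KeySerDeSketch {
      // Point a key descriptor's SerDe at BinarySortableSerDe, mirroring the optimizer change.
      static void useBinarySortableKeys(Properties keyTableProps) {
        keyTableProps.setProperty(
            serdeConstants.SERIALIZATION_LIB, BinarySortableSerDe.class.getName());
      }
    }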
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out b/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out
new file mode 100644
index 0000000..d1b775f
--- /dev/null
+++ b/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out
@@ -0,0 +1,853 @@
+PREHOOK: query: CREATE TABLE orc_table_1a(a INT) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_1a
+POSTHOOK: query: CREATE TABLE orc_table_1a(a INT) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_1a
+PREHOOK: query: CREATE TABLE orc_table_2a(c INT) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_2a
+POSTHOOK: query: CREATE TABLE orc_table_2a(c INT) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_2a
+PREHOOK: query: insert into table orc_table_1a values(1),(1), (2),(3)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@orc_table_1a
+POSTHOOK: query: insert into table orc_table_1a values(1),(1), (2),(3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@orc_table_1a
+POSTHOOK: Lineage: orc_table_1a.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into table orc_table_2a values(0),(2), (3),(null),(4)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@orc_table_2a
+POSTHOOK: query: insert into table orc_table_2a values(0),(2), (3),(null),(4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@orc_table_2a
+POSTHOOK: Lineage: orc_table_2a.c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: explain
+select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ outputColumnNames: _col4
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col4 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1a
+PREHOOK: Input: default@orc_table_2a
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1a
+POSTHOOK: Input: default@orc_table_2a
+#### A masked pattern was here ####
+3
+PREHOOK: query: explain
+select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: a is not null (type: boolean)
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: a (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col0 (type: int)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: c (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col0
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1a
+PREHOOK: Input: default@orc_table_2a
+#### A masked pattern was here ####
+POSTHOOK: query: select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1a
+POSTHOOK: Input: default@orc_table_2a
+#### A masked pattern was here ####
+3
+PREHOOK: query: CREATE TABLE orc_table_1b(v1 STRING, a INT) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_1b
+POSTHOOK: query: CREATE TABLE orc_table_1b(v1 STRING, a INT) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_1b
+PREHOOK: query: CREATE TABLE orc_table_2b(c INT, v2 STRING) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_2b
+POSTHOOK: query: CREATE TABLE orc_table_2b(c INT, v2 STRING) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_2b
+PREHOOK: query: insert into table orc_table_1b values("one", 1),("one", 1), ("two", 2),("three", 3)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__3
+PREHOOK: Output: default@orc_table_1b
+POSTHOOK: query: insert into table orc_table_1b values("one", 1),("one", 1), ("two", 2),("three", 3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__3
+POSTHOOK: Output: default@orc_table_1b
+POSTHOOK: Lineage: orc_table_1b.a EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: orc_table_1b.v1 SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into table orc_table_2b values(0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL>"),(4, "FOUR")
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__4
+PREHOOK: Output: default@orc_table_2b
+POSTHOOK: query: insert into table orc_table_2b values(0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL>"),(4, "FOUR")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__4
+POSTHOOK: Output: default@orc_table_2b
+POSTHOOK: Lineage: orc_table_2b.c EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: orc_table_2b.v2 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: explain
+select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ outputColumnNames: _col5, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col5 (type: string), _col6 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+three 3
+PREHOOK: query: explain
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col5 (type: string), _col6 (type: int), _col0 (type: int), _col1 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+three 3 3 THREE
+PREHOOK: query: explain
+select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col5 (type: string), (_col6 * 2) (type: int), (_col0 * 5) (type: int), _col1 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+three 6 15 THREE
+PREHOOK: query: explain
+select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ outputColumnNames: _col0, _col1, _col5
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col5 (type: string), _col1 (type: string), _col0 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+three THREE 3
+PREHOOK: query: explain
+select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 c (type: int)
+ 1 a (type: int)
+ outputColumnNames: _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col6 (type: int), _col5 (type: string), _col1 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+3 three THREE
+PREHOOK: query: explain
+select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ outputColumnNames: _col0, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col6 (type: string), _col5 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+three THREE 3
+PREHOOK: query: explain
+select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (c > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (a > 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col1 (type: int), _col0 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1b
+PREHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1b
+POSTHOOK: Input: default@orc_table_2b
+#### A masked pattern was here ####
+3 three THREE
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out b/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out
new file mode 100644
index 0000000..cc66db5
--- /dev/null
+++ b/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out
@@ -0,0 +1,242 @@
+PREHOOK: query: CREATE TABLE orc_table_1(v1 STRING, a INT) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_1
+POSTHOOK: query: CREATE TABLE orc_table_1(v1 STRING, a INT) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_1
+PREHOOK: query: CREATE TABLE orc_table_2(c INT, v2 STRING) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_2
+POSTHOOK: query: CREATE TABLE orc_table_2(c INT, v2 STRING) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_2
+PREHOOK: query: insert into table orc_table_1 values ("<null1>", null),("one", 1),("one", 1),("two", 2),("three", 3),("<null2>", null)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@orc_table_1
+POSTHOOK: query: insert into table orc_table_1 values ("<null1>", null),("one", 1),("one", 1),("two", 2),("three", 3),("<null2>", null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@orc_table_1
+POSTHOOK: Lineage: orc_table_1.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: orc_table_1.v1 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into table orc_table_2 values (0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL1>"),(4, "FOUR"),(null, "<NULL2>")
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@orc_table_2
+POSTHOOK: query: insert into table orc_table_2 values (0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL1>"),(4, "FOUR"),(null, "<NULL2>")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@orc_table_2
+POSTHOOK: Lineage: orc_table_2.c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: orc_table_2.v2 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: select * from orc_table_1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_table_1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1
+#### A masked pattern was here ####
+<null1> NULL
+<null2> NULL
+one 1
+one 1
+three 3
+two 2
+PREHOOK: query: select * from orc_table_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_table_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_2
+#### A masked pattern was here ####
+0 ZERO
+2 TWO
+3 THREE
+4 FOUR
+NULL <NULL1>
+NULL <NULL2>
+PREHOOK: query: explain
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 6 Data size: 550 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 6 Data size: 544 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: int), _col5 (type: int), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: -- SORT_QUERY_RESULTS
+
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1
+PREHOOK: Input: default@orc_table_2
+#### A masked pattern was here ####
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1
+POSTHOOK: Input: default@orc_table_2
+#### A masked pattern was here ####
+<null1> NULL NULL NULL
+<null2> NULL NULL NULL
+one 1 NULL NULL
+one 1 NULL NULL
+three 3 3 THREE
+two 2 2 TWO
+PREHOOK: query: explain
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 6 Data size: 544 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ Statistics: Num rows: 6 Data size: 550 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ keys:
+ 0 a (type: int)
+ 1 c (type: int)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: int), _col5 (type: int), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: -- SORT_QUERY_RESULTS
+
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_table_1
+PREHOOK: Input: default@orc_table_2
+#### A masked pattern was here ####
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+
+select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_table_1
+POSTHOOK: Input: default@orc_table_2
+#### A masked pattern was here ####
+NULL NULL 0 ZERO
+NULL NULL 4 FOUR
+NULL NULL NULL <NULL1>
+NULL NULL NULL <NULL2>
+three 3 3 THREE
+two 2 2 TWO