You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vi...@apache.org on 2017/11/06 22:26:06 UTC
hive git commit: HIVE-17874 : Parquet vectorization fails on tables
with complex columns when there are no projected columns (Vihang Karajgaonkar,
reviewed by Ferdinand Xu)
Repository: hive
Updated Branches:
refs/heads/branch-2 e2d5d0005 -> 307f58270
HIVE-17874 : Parquet vectorization fails on tables with complex columns when there are no projected columns (Vihang Karajgaonkar, reviewed by Ferdinand Xu)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/307f5827
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/307f5827
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/307f5827
Branch: refs/heads/branch-2
Commit: 307f58270b7f7abd7311f192540fc969cd379869
Parents: e2d5d00
Author: Vihang Karajgaonkar <vi...@cloudera.com>
Authored: Mon Nov 6 14:16:07 2017 -0800
Committer: Vihang Karajgaonkar <vi...@cloudera.com>
Committed: Mon Nov 6 14:16:07 2017 -0800
----------------------------------------------------------------------
 itests/src/test/resources/testconfiguration.properties | 1 +
 ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java | 28 +-
 ql/src/test/queries/clientpositive/vectorization_parquet_projection.q | 77 ++++
 ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out | 459 +++++++++++++++++++
 ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out | 426 +++++++++++++++++
5 files changed, 979 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 639ffa8..e2c59f2 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -1343,6 +1343,7 @@ spark.query.files=add_part_multiple.q, \
vectorization_not.q, \
vectorization_part.q, \
vectorization_part_project.q, \
+ vectorization_parquet_projection.q, \
vectorization_pushdown.q, \
vectorization_short_regress.q, \
vectorized_case.q, \
http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index 9359098..190c639 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -72,7 +72,6 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
private List<String> columnNamesList;
private List<TypeInfo> columnTypesList;
private VectorizedRowBatchCtx rbCtx;
- private List<Integer> indexColumnsWanted;
private Object[] partitionValues;
/**
@@ -105,8 +104,6 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
serDeStats = new SerDeStats();
projectionPusher = new ProjectionPusher();
initialize(inputSplit, conf);
- colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf);
- rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
} catch (Throwable e) {
LOG.error("Failed to create the vectorized reader due to exception " + e);
throw new RuntimeException(e);
@@ -123,8 +120,6 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
if (inputSplit != null) {
initialize(inputSplit, conf);
}
- colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf);
- rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
initPartitionValues((FileSplit) oldInputSplit, conf);
} catch (Throwable e) {
LOG.error("Failed to create the vectorized reader due to exception " + e);
@@ -145,11 +140,14 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
public void initialize(
InputSplit oldSplit,
JobConf configuration) throws IOException, InterruptedException {
+ colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
+ //initialize the rowbatchContext
+ jobConf = configuration;
+ rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf);
// the oldSplit may be null during the split phase
if (oldSplit == null) {
return;
}
- jobConf = configuration;
ParquetMetadata footer;
List<BlockMetaData> blocks;
ParquetInputSplit split = (ParquetInputSplit) oldSplit;
@@ -206,7 +204,7 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
}
this.fileSchema = footer.getFileMetaData().getSchema();
- indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
+ colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
requestedSchema = DataWritableReadSupport
.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
@@ -294,11 +292,17 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
List<Type> types = requestedSchema.getFields();
columnReaders = new VectorizedColumnReader[columns.size()];
- if (!ColumnProjectionUtils.isReadAllColumns(jobConf) && !indexColumnsWanted.isEmpty()) {
- for (int i = 0; i < types.size(); ++i) {
- columnReaders[i] =
- buildVectorizedParquetReader(columnTypesList.get(indexColumnsWanted.get(i)), types.get(i),
- pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
+ if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
+ //certain queries like select count(*) from table do not have
+ //any projected columns and still have isReadAllColumns as false
+ //in such cases columnReaders are not needed
+ //However, if colsToInclude is not empty we should initialize each columnReader
+ if(!colsToInclude.isEmpty()) {
+ for (int i = 0; i < types.size(); ++i) {
+ columnReaders[i] =
+ buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i),
+ pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
+ }
}
} else {
for (int i = 0; i < types.size(); ++i) {
http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
new file mode 100644
index 0000000..76fbf0e
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
@@ -0,0 +1,77 @@
+set hive.fetch.task.conversion=none;
+set hive.compute.query.using.stats=false;
+set hive.vectorized.use.row.serde.deserialize=false;
+set hive.vectorized.use.vector.serde.deserialize=false;
+set hive.vectorized.execution.enabled=true;
+set hive.vectorized.execution.reduce.enabled=true;
+set hive.mapred.mode=nonstrict;
+set hive.llap.cache.allow.synthetic.fileid=true;
+
+-- SORT_QUERY_RESULTS
+
+DROP TABLE IF EXISTS parquet_types_staging;
+
+CREATE TABLE parquet_types_staging (
+ cint int,
+ ctinyint tinyint,
+ csmallint smallint,
+ cfloat float,
+ cdouble double,
+ cstring1 string,
+ t timestamp,
+ cchar char(5),
+ cvarchar varchar(10),
+ cbinary string,
+ m1 map<string, varchar(3)>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:char(1)>,
+ d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':';
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging;
+
+-- test various number of projected columns
+
+DROP TABLE IF EXISTS parquet_project_test;
+
+CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET;
+
+insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1;
+
+insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2;
+
+insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3;
+
+-- no columns in the projection
+explain vectorization select * from parquet_project_test;
+select * from parquet_project_test;
+
+-- no columns in the projection, just count(*)
+explain vectorization select count(*) from parquet_project_test;
+select count(*) from parquet_project_test;
+
+-- project a primitive type
+explain vectorization select cint, count(*) from parquet_project_test
+group by cint;
+
+select cint, count(*) from parquet_project_test
+group by cint;
+
+-- test complex type in projection, this should not get vectorized
+explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"];
+
+select m1["color"], count(*) from parquet_project_test
+group by m1["color"];
http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
new file mode 100644
index 0000000..c27e61c
--- /dev/null
+++ b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
@@ -0,0 +1,459 @@
+PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_types_staging (
+ cint int,
+ ctinyint tinyint,
+ csmallint smallint,
+ cfloat float,
+ cdouble double,
+ cstring1 string,
+ t timestamp,
+ cchar char(5),
+ cvarchar varchar(10),
+ cbinary string,
+ m1 map<string, varchar(3)>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:char(1)>,
+ d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: CREATE TABLE parquet_types_staging (
+ cint int,
+ ctinyint tinyint,
+ csmallint smallint,
+ cfloat float,
+ cdouble double,
+ cstring1 string,
+ t timestamp,
+ cchar char(5),
+ cvarchar varchar(10),
+ cbinary string,
+ m1 map<string, varchar(3)>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:char(1)>,
+ d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_project_test
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: explain vectorization select * from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select * from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: cint (type: int), m1 (type: map<string,string>)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+ vectorized: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select * from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+PREHOOK: query: explain vectorization select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ groupByVectorOutput: true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Vectorization:
+ enabled: true
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+ groupByVectorOutput: true
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+22
+PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 2)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: cint (type: int)
+ outputColumnNames: cint
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ keys: cint (type: int)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ groupByVectorOutput: true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Vectorization:
+ enabled: true
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+ groupByVectorOutput: true
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1 8
+2 7
+3 7
+PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 2)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: m1['color'] (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ keys: _col0 (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+ vectorized: false
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Vectorization:
+ enabled: true
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+ groupByVectorOutput: true
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+blue 7
+green 7
+red 8
http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
new file mode 100644
index 0000000..02a28de
--- /dev/null
+++ b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
@@ -0,0 +1,426 @@
+PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_types_staging (
+ cint int,
+ ctinyint tinyint,
+ csmallint smallint,
+ cfloat float,
+ cdouble double,
+ cstring1 string,
+ t timestamp,
+ cchar char(5),
+ cvarchar varchar(10),
+ cbinary string,
+ m1 map<string, varchar(3)>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:char(1)>,
+ d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: CREATE TABLE parquet_types_staging (
+ cint int,
+ ctinyint tinyint,
+ csmallint smallint,
+ cfloat float,
+ cdouble double,
+ cstring1 string,
+ t timestamp,
+ cchar char(5),
+ cvarchar varchar(10),
+ cbinary string,
+ m1 map<string, varchar(3)>,
+ l1 array<int>,
+ st1 struct<c1:int, c2:char(1)>,
+ d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_project_test
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: explain vectorization select * from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select * from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: cint (type: int), m1 (type: map<string,string>)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+ vectorized: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select * from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+1 {"color":"red"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+2 {"color":"green"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+3 {"color":"blue"}
+PREHOOK: query: explain vectorization select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ groupByVectorOutput: true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Vectorization:
+ enabled: false
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+ enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+22
+PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: cint (type: int)
+ outputColumnNames: cint
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ keys: cint (type: int)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ groupByVectorOutput: true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Vectorization:
+ enabled: false
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+ enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1 8
+2 7
+3 7
+PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_project_test
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: m1['color'] (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ keys: _col0 (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+ vectorized: false
+ Reduce Vectorization:
+ enabled: false
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+ enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+blue 7
+green 7
+red 8