You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vi...@apache.org on 2018/01/18 16:51:30 UTC
hive git commit: HIVE-18323 : Vectorization: add the support of
timestamp in VectorizedPrimitiveColumnReader for parquet (Vihang Karajgaonkar,
reviewed by Aihua Xu and Ferdinand Xu)
Repository: hive
Updated Branches:
refs/heads/master 80e6f7b0f -> ef9d3ee97
HIVE-18323 : Vectorization: add the support of timestamp in VectorizedPrimitiveColumnReader for parquet (Vihang Karajgaonkar, reviewed by Aihua Xu and Ferdinand Xu)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ef9d3ee9
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ef9d3ee9
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ef9d3ee9
Branch: refs/heads/master
Commit: ef9d3ee97cee30725381013f4051790c432aa726
Parents: 80e6f7b
Author: Vihang Karajgaonkar <vi...@cloudera.com>
Authored: Fri Jan 12 17:54:48 2018 -0800
Committer: Vihang Karajgaonkar <vi...@cloudera.com>
Committed: Thu Jan 18 08:30:38 2018 -0800
----------------------------------------------------------------------
.../vector/VectorizedPrimitiveColumnReader.java | 33 +-
.../clientpositive/vectorized_parquet_types.q | 40 +++
.../llap/vectorized_parquet_types.q.out | 300 ++++++++++++++++
.../vectorized_parquet_types.q.out | 357 +++++++++++++++++++
4 files changed, 729 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/ef9d3ee9/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
index 5e577d2..95faaee 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
@@ -110,8 +110,10 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader
case DECIMAL:
readDecimal(num, (DecimalColumnVector) column, rowId);
break;
- case INTERVAL_DAY_TIME:
case TIMESTAMP:
+ readTimestamp(num, (TimestampColumnVector) column, rowId);
+ break;
+ case INTERVAL_DAY_TIME:
default:
throw new IOException("Unsupported type: " + type);
}
@@ -288,6 +290,35 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader
}
}
+ private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws IOException { // Fills c with `total` entries starting at index rowId, flagging nulls as it goes.
+ int left = total; // number of entries still to process
+ while (left > 0) {
+ readRepetitionAndDefinitionLevels(); // helper defined outside this view; presumably refreshes definitionLevel for the next entry — confirm
+ if (definitionLevel >= maxDefLevel) { // this branch reads a value and marks the slot non-null
+ switch (descriptor.getType()) { // dispatch on the column descriptor's type
+ //INT64 is not yet supported
+ case INT96:
+ NanoTime nt = NanoTime.fromBinary(dataColumn.readBytes()); // decode the bytes read from the data column into a NanoTime
+ Timestamp ts = NanoTimeUtils.getTimestamp(nt, skipTimestampConversion); // skipTimestampConversion is passed straight through to the conversion helper
+ c.set(rowId, ts);
+ break;
+ default:
+ throw new IOException(
+ "Unsupported parquet logical type: " + type.getOriginalType() + " for timestamp"); // NOTE(review): the switch is on descriptor.getType() but the message prints type.getOriginalType() — confirm this is intended
+ }
+ c.isNull[rowId] = false;
+ c.isRepeating = // stays true only while each value read equals entry 0 in both time and nanos
+ c.isRepeating && ((c.time[0] == c.time[rowId]) && (c.nanos[0] == c.nanos[rowId])); // NOTE(review): compares against index 0, not the starting rowId — confirm callers always start a batch at 0
+ } else { // otherwise no value is read and the slot is recorded as null
+ c.isNull[rowId] = true;
+ c.isRepeating = false; // a null entry ends the repeating state
+ c.noNulls = false; // the vector now holds at least one null
+ }
+ rowId++;
+ left--;
+ }
+ }
+
/**
* Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`.
*/
http://git-wip-us.apache.org/repos/asf/hive/blob/ef9d3ee9/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vectorized_parquet_types.q b/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
index 63f811b..b122103 100644
--- a/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
+++ b/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
@@ -4,6 +4,7 @@ set hive.llap.cache.allow.synthetic.fileid=true;
DROP TABLE parquet_types_staging;
DROP TABLE parquet_types;
+DROP TABLE IF EXISTS parquet_type_nodict;
-- init
CREATE TABLE parquet_types_staging (
@@ -84,3 +85,42 @@ SELECT ctinyint,
FROM parquet_types
GROUP BY ctinyint
ORDER BY ctinyint;
+
+-- test with dictionary encoding disabled
+create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false");
+
+insert into parquet_type_nodict
+select * from parquet_types;
+
+
+explain vectorization expression
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict;
+
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict;
+
+explain vectorization expression
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict;
+
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict;
+
+-- test timestamp vectorization
+explain vectorization select max(t), min(t) from parquet_type_nodict;
+select max(t), min(t) from parquet_type_nodict;
+
+-- test timestamp columnVector isRepeating
+create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false");
+
+insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456');
+
+set hive.fetch.task.conversion=none;
+select ts from test where id > 1;
+
+-- test null values in timestamp
+insert into test values (3, NULL);
+select ts from test where id > 1;
+
+DROP TABLE parquet_type_nodict;
+DROP TABLE test;
http://git-wip-us.apache.org/repos/asf/hive/blob/ef9d3ee9/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out b/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
index 489ae42..05e34d6 100644
--- a/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
@@ -6,6 +6,10 @@ PREHOOK: query: DROP TABLE parquet_types
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE parquet_types
POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict
+POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE TABLE parquet_types_staging (
cint int,
ctinyint tinyint,
@@ -415,3 +419,299 @@ POSTHOOK: Input: default@parquet_types
1 121 1 8 1.1749999970197678 2.0621590627301285 90.33
2 119 1 7 1.2142857142857142 1.8 60.12
3 120 1 7 1.171428578240531 1.7999999999999996 90.21
+PREHOOK: query: create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_type_nodict
+PREHOOK: query: insert into parquet_type_nodict
+select * from parquet_types
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: insert into parquet_type_nodict
+select * from parquet_types
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types
+POSTHOOK: Output: default@parquet_type_nodict
+POSTHOOK: Lineage: parquet_type_nodict.cbinary SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cbinary, type:binary, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cchar, type:char(5), comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cdecimal SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdecimal, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cdouble SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdouble, type:double, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cfloat SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cfloat, type:float, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.csmallint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:csmallint, type:smallint, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cstring1 SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cstring1, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.ctinyint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:ctinyint, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cvarchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.t SIMPLE [(parquet_types)parquet_types.FieldSchema(name:t, type:timestamp, comment:null), ]
+PREHOOK: query: explain vectorization expression
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization expression
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ TableScan
+ alias: parquet_type_nodict
+ Select Operator
+ expressions: cint (type: int), ctinyint (type: tinyint), csmallint (type: smallint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), t (type: timestamp), cchar (type: char(5)), cvarchar (type: varchar(10)), hex(cbinary) (type: string), cdecimal (type: decimal(4,2))
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10
+ ListSink
+
+PREHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 a a B4F3CAFDBEDD 48.88
+101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 ab ab 68692CCAC0BDE7 8.72
+102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 abc abc B4F3CAFDBEDD 90.21
+103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 abcd abcd 68692CCAC0BDE7 3.89
+104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 abcde abcde B4F3CAFDBEDD 56.23
+105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 abcde abcdef 68692CCAC0BDE7 90.21
+106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 abcde abcdefg B4F3CAFDBEDD 6.09
+107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 bcdef abcdefgh 68692CCAC0BDE7 9.44
+108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 cdefg B4F3CAFDBE 68656C6C6F 77.54
+109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 klmno abcdedef 68692CCAC0BDE7 25.42
+110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 pqrst abcdede B4F3CAFDBEDD 60.12
+111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 nopqr abcded 68692CCAC0BDE7 49.56
+112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 opqrs abcdd B4F3CAFDBEDD 80.76
+113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 pqrst abc 68692CCAC0BDE7 23.23
+114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 qrstu b B4F3CAFDBEDD 1.01
+115 1 1 1.0 4.5 qrs 2026-04-04 16:16:16.161616161 rstuv abcded 68692CCAC0BDE7 5.98
+116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 stuvw abcded B4F3CAFDBEDD 11.22
+117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 tuvwx abcded 68692CCAC0BDE7 9.88
+118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 uvwzy abcdede B4F3CAFDBEDD 4.76
+119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 vwxyz abcdede 68692CCAC0BDE7 12.83
+120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 wxyza abcde B4F3CAFDBEDD 73.04
+121 1 2 1.1 6.3 lmn 2032-10-10 22:22:22.222222222 bcdef abcde 90.33
+PREHOOK: query: explain vectorization expression
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization expression
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ TableScan
+ alias: parquet_type_nodict
+ Select Operator
+ expressions: cchar (type: char(5)), length(cchar) (type: int), cvarchar (type: varchar(10)), length(cvarchar) (type: int), cdecimal (type: decimal(4,2)), sign(cdecimal) (type: int)
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ ListSink
+
+PREHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+a 1 a 3 48.88 1
+ab 2 ab 3 8.72 1
+abc 3 abc 3 90.21 1
+abcd 4 abcd 4 3.89 1
+abcde 5 abcde 5 56.23 1
+abcde 5 abcdef 6 90.21 1
+abcde 5 abcdefg 7 6.09 1
+bcdef 5 abcdefgh 8 9.44 1
+cdefg 5 B4F3CAFDBE 10 77.54 1
+klmno 5 abcdedef 8 25.42 1
+pqrst 5 abcdede 7 60.12 1
+nopqr 5 abcded 6 49.56 1
+opqrs 5 abcdd 5 80.76 1
+pqrst 5 abc 3 23.23 1
+qrstu 5 b 1 1.01 1
+rstuv 5 abcded 6 5.98 1
+stuvw 5 abcded 6 11.22 1
+tuvwx 5 abcded 6 9.88 1
+uvwzy 5 abcdede 7 4.76 1
+vwxyz 5 abcdede 7 12.83 1
+wxyza 5 abcde 5 73.04 1
+bcdef 5 abcde 5 90.33 1
+PREHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: parquet_type_nodict
+ Statistics: Num rows: 22 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: t (type: timestamp)
+ outputColumnNames: t
+ Statistics: Num rows: 22 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: max(t), min(t)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: timestamp), _col1 (type: timestamp)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs (cache only)
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: []
+ featureSupportInUse: []
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: max(VALUE._col0), min(VALUE._col1)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select max(t), min(t) from parquet_type_nodict
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+POSTHOOK: query: select max(t), min(t) from parquet_type_nodict
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+2032-10-10 22:22:22.222222222 2011-01-01 01:01:01.111111111
+PREHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Lineage: test.id SCRIPT []
+POSTHOOK: Lineage: test.ts SCRIPT []
+PREHOOK: query: select ts from test where id > 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: select ts from test where id > 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+2019-01-01 23:12:45.123456
+2019-01-01 23:12:45.123456
+PREHOOK: query: insert into test values (3, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: insert into test values (3, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Lineage: test.id SCRIPT []
+POSTHOOK: Lineage: test.ts EXPRESSION []
+PREHOOK: query: select ts from test where id > 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: select ts from test where id > 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+2019-01-01 23:12:45.123456
+2019-01-01 23:12:45.123456
+NULL
+PREHOOK: query: DROP TABLE parquet_type_nodict
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_type_nodict
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: DROP TABLE parquet_type_nodict
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_type_nodict
+POSTHOOK: Output: default@parquet_type_nodict
+PREHOOK: query: DROP TABLE test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@test
+PREHOOK: Output: default@test
+POSTHOOK: query: DROP TABLE test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@test
+POSTHOOK: Output: default@test
http://git-wip-us.apache.org/repos/asf/hive/blob/ef9d3ee9/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out b/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out
index 1a08d46..0dc582f 100644
--- a/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out
+++ b/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out
@@ -6,6 +6,10 @@ PREHOOK: query: DROP TABLE parquet_types
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE parquet_types
POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict
+POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE TABLE parquet_types_staging (
cint int,
ctinyint tinyint,
@@ -478,3 +482,356 @@ POSTHOOK: Input: default@parquet_types
1 121 1 8 1.1749999970197678 2.0621590627301285 90.33
2 119 1 7 1.2142857142857142 1.8 60.12
3 120 1 7 1.171428578240531 1.7999999999999996 90.21
+PREHOOK: query: create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_type_nodict
+PREHOOK: query: insert into parquet_type_nodict
+select * from parquet_types
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: insert into parquet_type_nodict
+select * from parquet_types
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types
+POSTHOOK: Output: default@parquet_type_nodict
+POSTHOOK: Lineage: parquet_type_nodict.cbinary SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cbinary, type:binary, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cchar, type:char(5), comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cdecimal SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdecimal, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cdouble SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdouble, type:double, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cfloat SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cfloat, type:float, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.csmallint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:csmallint, type:smallint, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cstring1 SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cstring1, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.ctinyint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:ctinyint, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.cvarchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ]
+POSTHOOK: Lineage: parquet_type_nodict.t SIMPLE [(parquet_types)parquet_types.FieldSchema(name:t, type:timestamp, comment:null), ]
+PREHOOK: query: explain vectorization expression
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization expression
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_type_nodict
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ TableScan Vectorization:
+ native: true
+ Select Operator
+ expressions: cint (type: int), ctinyint (type: tinyint), csmallint (type: smallint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), t (type: timestamp), cchar (type: char(5)), cvarchar (type: varchar(10)), hex(cbinary) (type: string), cdecimal (type: decimal(4,2))
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 10]
+ selectExpressions: VectorUDFAdaptor(hex(cbinary)) -> 12:string
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: []
+ featureSupportInUse: []
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: true
+ vectorized: true
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 a a B4F3CAFDBEDD 48.88
+101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 ab ab 68692CCAC0BDE7 8.72
+102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 abc abc B4F3CAFDBEDD 90.21
+103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 abcd abcd 68692CCAC0BDE7 3.89
+104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 abcde abcde B4F3CAFDBEDD 56.23
+105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 abcde abcdef 68692CCAC0BDE7 90.21
+106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 abcde abcdefg B4F3CAFDBEDD 6.09
+107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 bcdef abcdefgh 68692CCAC0BDE7 9.44
+108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 cdefg B4F3CAFDBE 68656C6C6F 77.54
+109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 klmno abcdedef 68692CCAC0BDE7 25.42
+110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 pqrst abcdede B4F3CAFDBEDD 60.12
+111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 nopqr abcded 68692CCAC0BDE7 49.56
+112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 opqrs abcdd B4F3CAFDBEDD 80.76
+113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 pqrst abc 68692CCAC0BDE7 23.23
+114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 qrstu b B4F3CAFDBEDD 1.01
+115 1 1 1.0 4.5 qrs 2026-04-04 16:16:16.161616161 rstuv abcded 68692CCAC0BDE7 5.98
+116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 stuvw abcded B4F3CAFDBEDD 11.22
+117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 tuvwx abcded 68692CCAC0BDE7 9.88
+118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 uvwzy abcdede B4F3CAFDBEDD 4.76
+119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 vwxyz abcdede 68692CCAC0BDE7 12.83
+120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 wxyza abcde B4F3CAFDBEDD 73.04
+121 1 2 1.1 6.3 lmn 2032-10-10 22:22:22.222222222 bcdef abcde 90.33
+PREHOOK: query: explain vectorization expression
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization expression
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_type_nodict
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ TableScan Vectorization:
+ native: true
+ Select Operator
+ expressions: cchar (type: char(5)), length(cchar) (type: int), cvarchar (type: varchar(10)), length(cvarchar) (type: int), cdecimal (type: decimal(4,2)), sign(cdecimal) (type: int)
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [7, 12, 8, 13, 10, 14]
+ selectExpressions: StringLength(col 7:char(5)) -> 12:int, StringLength(col 8:varchar(10)) -> 13:int, FuncSignDecimalToLong(col 10:decimal(4,2)) -> 14:int
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: []
+ featureSupportInUse: []
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+a 1 a 3 48.88 1
+ab 2 ab 3 8.72 1
+abc 3 abc 3 90.21 1
+abcd 4 abcd 4 3.89 1
+abcde 5 abcde 5 56.23 1
+abcde 5 abcdef 6 90.21 1
+abcde 5 abcdefg 7 6.09 1
+bcdef 5 abcdefgh 8 9.44 1
+cdefg 5 B4F3CAFDBE 10 77.54 1
+klmno 5 abcdedef 8 25.42 1
+pqrst 5 abcdede 7 60.12 1
+nopqr 5 abcded 6 49.56 1
+opqrs 5 abcdd 5 80.76 1
+pqrst 5 abc 3 23.23 1
+qrstu 5 b 1 1.01 1
+rstuv 5 abcded 6 5.98 1
+stuvw 5 abcded 6 11.22 1
+tuvwx 5 abcded 6 9.88 1
+uvwzy 5 abcdede 7 4.76 1
+vwxyz 5 abcdede 7 12.83 1
+wxyza 5 abcde 5 73.04 1
+bcdef 5 abcde 5 90.33 1
+PREHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parquet_type_nodict
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: t (type: timestamp)
+ outputColumnNames: t
+ Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: max(t), min(t)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: timestamp), _col1 (type: timestamp)
+ Execution mode: vectorized
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: []
+ featureSupportInUse: []
+ inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Vectorization:
+ enabled: false
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+ enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: max(VALUE._col0), min(VALUE._col1)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select max(t), min(t) from parquet_type_nodict
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+POSTHOOK: query: select max(t), min(t) from parquet_type_nodict
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_type_nodict
+#### A masked pattern was here ####
+2032-10-10 22:22:22.222222222 2011-01-01 01:01:01.111111111
+PREHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Lineage: test.id SCRIPT []
+POSTHOOK: Lineage: test.ts SCRIPT []
+PREHOOK: query: select ts from test where id > 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: select ts from test where id > 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+2019-01-01 23:12:45.123456
+2019-01-01 23:12:45.123456
+PREHOOK: query: insert into test values (3, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: insert into test values (3, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Lineage: test.id SCRIPT []
+POSTHOOK: Lineage: test.ts EXPRESSION []
+PREHOOK: query: select ts from test where id > 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: select ts from test where id > 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+2019-01-01 23:12:45.123456
+2019-01-01 23:12:45.123456
+NULL
+PREHOOK: query: DROP TABLE parquet_type_nodict
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parquet_type_nodict
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: DROP TABLE parquet_type_nodict
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parquet_type_nodict
+POSTHOOK: Output: default@parquet_type_nodict
+PREHOOK: query: DROP TABLE test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@test
+PREHOOK: Output: default@test
+POSTHOOK: query: DROP TABLE test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@test
+POSTHOOK: Output: default@test