You are viewing a plain text version of this content. The canonical link for it is the commit URL listed in the message below.
Posted to commits@hive.apache.org by se...@apache.org on 2018/07/03 17:47:34 UTC
[06/46] hive git commit: HIVE-19951: Vectorization: Need to disable
encoded LLAP I/O for ORC when there is data type conversion (Schema
Evolution) (Matt McCline, reviewed by Prasanth Jayachandran)
HIVE-19951: Vectorization: Need to disable encoded LLAP I/O for ORC when there is data type conversion (Schema Evolution) (Matt McCline, reviewed by Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/d78d6465
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/d78d6465
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/d78d6465
Branch: refs/heads/master-txnstats
Commit: d78d6465a969ef7e7b3363a257f2c4e6f748d0df
Parents: bb531be
Author: Matt McCline <mm...@hortonworks.com>
Authored: Mon Jul 2 08:40:39 2018 -0500
Committer: Matt McCline <mm...@hortonworks.com>
Committed: Mon Jul 2 08:40:39 2018 -0500
----------------------------------------------------------------------
.../test/resources/testconfiguration.properties | 1 +
.../hive/llap/io/api/impl/LlapRecordReader.java | 64 +++++++
.../vector_llap_io_data_conversion.q | 19 ++
.../llap/vector_llap_io_data_conversion.q.out | 187 +++++++++++++++++++
4 files changed, 271 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/d78d6465/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 35fad2c..d415b7d 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -761,6 +761,7 @@ minillaplocal.query.files=\
vector_join_filters.q,\
vector_leftsemi_mapjoin.q,\
vector_like_2.q,\
+ vector_llap_io_data_conversion.q,\
vector_llap_text_1.q,\
vector_mapjoin_reduce.q,\
vector_null_map.q,\
http://git-wip-us.apache.org/repos/asf/hive/blob/d78d6465/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
index 201c097..be748e9 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
@@ -289,8 +289,72 @@ class LlapRecordReader
executor.submit(rp.getReadCallable());
}
+ private boolean hasSchemaEvolutionStringFamilyTruncateIssue(SchemaEvolution evolution) {
+ return hasStringFamilyTruncateTypeIssue(evolution, evolution.getReaderSchema());
+ }
+
+ // We recurse through the types.
+ private boolean hasStringFamilyTruncateTypeIssue(SchemaEvolution evolution,
+ TypeDescription readerType) {
+ TypeDescription fileType = evolution.getFileType(readerType);
+ if (fileType == null) {
+ return false;
+ }
+ switch (fileType.getCategory()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DOUBLE:
+ case FLOAT:
+ case STRING:
+ case TIMESTAMP:
+ case BINARY:
+ case DATE:
+ case DECIMAL:
+ // We are only looking for the CHAR/VARCHAR truncate issue.
+ return false;
+ case CHAR:
+ case VARCHAR:
+ if (readerType.getCategory().equals(TypeDescription.Category.CHAR) ||
+ readerType.getCategory().equals(TypeDescription.Category.VARCHAR)) {
+ return (fileType.getMaxLength() > readerType.getMaxLength());
+ }
+ return false;
+ case UNION:
+ case MAP:
+ case LIST:
+ case STRUCT:
+ {
+ List<TypeDescription> readerChildren = readerType.getChildren();
+ final int childCount = readerChildren.size();
+ for (int i = 0; i < childCount; ++i) {
+ if (hasStringFamilyTruncateTypeIssue(evolution, readerChildren.get(i))) {
+ return true;
+ }
+ }
+ }
+ return false;
+ default:
+ throw new IllegalArgumentException("Unknown type " + fileType);
+ }
+ }
+
private boolean checkOrcSchemaEvolution() {
SchemaEvolution evolution = rp.getSchemaEvolution();
+
+ /*
+ * FUTURE: When SchemaEvolution.isOnlyImplicitConversion becomes available:
+ * 1) Replace the hasSchemaEvolutionStringFamilyTruncateIssue call with
+ * !isOnlyImplicitConversion.
+ * 2) Delete hasSchemaEvolutionStringFamilyTruncateIssue code.
+ */
+ if (evolution.hasConversion() && hasSchemaEvolutionStringFamilyTruncateIssue(evolution)) {
+
+ // We do not support data type conversion when reading encoded ORC data.
+ return false;
+ }
// TODO: should this just use physical IDs?
for (int i = 0; i < includes.getReaderLogicalColumnIds().size(); ++i) {
int projectedColId = includes.getReaderLogicalColumnIds().get(i);
http://git-wip-us.apache.org/repos/asf/hive/blob/d78d6465/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q b/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q
new file mode 100644
index 0000000..f40c4b9
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q
@@ -0,0 +1,19 @@
+--! qt:dataset:alltypesorc
+set hive.explain.user=false;
+SET hive.vectorized.execution.enabled=true;
+
+set hive.llap.io.enabled=true;
+set hive.llap.io.encode.enabled=true;
+
+create table varchar_single_partition(vt varchar(10), vsi varchar(10), vi varchar(20), vb varchar(30), vf varchar(20),vd varchar(20),vs varchar(50))
+ partitioned by(s varchar(50)) stored as orc;
+insert into table varchar_single_partition partition(s='positive') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint>0 limit 10;
+insert into table varchar_single_partition partition(s='negative') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint<0 limit 10;
+alter table varchar_single_partition change column vs vs varchar(10);
+
+create table varchar_ctas_1 stored as orc as select vs, length(vs) as c1,reverse(vs) as c2 from varchar_single_partition where s='positive';
+
+explain vectorization detail
+select * from varchar_ctas_1 order by vs, c1, c2;
+
+select * from varchar_ctas_1 order by vs, c1, c2;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/d78d6465/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out b/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out
new file mode 100644
index 0000000..f503761
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out
@@ -0,0 +1,187 @@
+PREHOOK: query: create table varchar_single_partition(vt varchar(10), vsi varchar(10), vi varchar(20), vb varchar(30), vf varchar(20),vd varchar(20),vs varchar(50))
+ partitioned by(s varchar(50)) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@varchar_single_partition
+POSTHOOK: query: create table varchar_single_partition(vt varchar(10), vsi varchar(10), vi varchar(20), vb varchar(30), vf varchar(20),vd varchar(20),vs varchar(50))
+ partitioned by(s varchar(50)) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@varchar_single_partition
+PREHOOK: query: insert into table varchar_single_partition partition(s='positive') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint>0 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@varchar_single_partition@s=positive
+POSTHOOK: query: insert into table varchar_single_partition partition(s='positive') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint>0 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@varchar_single_partition@s=positive
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vb EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cbigint, type:bigint, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vd EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cdouble, type:double, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vf EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cfloat, type:float, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vs EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vsi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:csmallint, type:smallint, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vt EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:ctinyint, type:tinyint, comment:null), ]
+PREHOOK: query: insert into table varchar_single_partition partition(s='negative') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint<0 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@varchar_single_partition@s=negative
+POSTHOOK: query: insert into table varchar_single_partition partition(s='negative') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint<0 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@varchar_single_partition@s=negative
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vb EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cbigint, type:bigint, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vd EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cdouble, type:double, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vf EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cfloat, type:float, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vs EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vsi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:csmallint, type:smallint, comment:null), ]
+POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vt EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:ctinyint, type:tinyint, comment:null), ]
+PREHOOK: query: alter table varchar_single_partition change column vs vs varchar(10)
+PREHOOK: type: ALTERTABLE_RENAMECOL
+PREHOOK: Input: default@varchar_single_partition
+PREHOOK: Output: default@varchar_single_partition
+POSTHOOK: query: alter table varchar_single_partition change column vs vs varchar(10)
+POSTHOOK: type: ALTERTABLE_RENAMECOL
+POSTHOOK: Input: default@varchar_single_partition
+POSTHOOK: Output: default@varchar_single_partition
+PREHOOK: query: create table varchar_ctas_1 stored as orc as select vs, length(vs) as c1,reverse(vs) as c2 from varchar_single_partition where s='positive'
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@varchar_single_partition
+PREHOOK: Input: default@varchar_single_partition@s=positive
+PREHOOK: Output: database:default
+PREHOOK: Output: default@varchar_ctas_1
+POSTHOOK: query: create table varchar_ctas_1 stored as orc as select vs, length(vs) as c1,reverse(vs) as c2 from varchar_single_partition where s='positive'
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@varchar_single_partition
+POSTHOOK: Input: default@varchar_single_partition@s=positive
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@varchar_ctas_1
+POSTHOOK: Lineage: varchar_ctas_1.c1 EXPRESSION [(varchar_single_partition)varchar_single_partition.FieldSchema(name:vs, type:varchar(10), comment:null), ]
+POSTHOOK: Lineage: varchar_ctas_1.c2 EXPRESSION [(varchar_single_partition)varchar_single_partition.FieldSchema(name:vs, type:varchar(10), comment:null), ]
+POSTHOOK: Lineage: varchar_ctas_1.vs SIMPLE [(varchar_single_partition)varchar_single_partition.FieldSchema(name:vs, type:varchar(10), comment:null), ]
+PREHOOK: query: explain vectorization detail
+select * from varchar_ctas_1 order by vs, c1, c2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization detail
+select * from varchar_ctas_1 order by vs, c1, c2
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: varchar_ctas_1
+ Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE
+ TableScan Vectorization:
+ native: true
+ vectorizationSchemaColumns: [0:vs:varchar(10), 1:c1:int, 2:c2:string, 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+ Select Operator
+ expressions: vs (type: varchar(10)), c1 (type: int), c2 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [0, 1, 2]
+ Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: varchar(10)), _col1 (type: int), _col2 (type: string)
+ sort order: +++
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkObjectHashOperator
+ keyColumnNums: [0, 1, 2]
+ native: true
+ nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+ valueColumnNums: []
+ Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ allNative: true
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 3
+ includeColumns: [0, 1, 2]
+ dataColumns: vs:varchar(10), c1:int, c2:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: []
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ reduceColumnNullOrder: aaa
+ reduceColumnSortOrder: +++
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 3
+ dataColumns: KEY.reducesinkkey0:varchar(10), KEY.reducesinkkey1:int, KEY.reducesinkkey2:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: []
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: varchar(10)), KEY.reducesinkkey1 (type: int), KEY.reducesinkkey2 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [0, 1, 2]
+ Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+ Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select * from varchar_ctas_1 order by vs, c1, c2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@varchar_ctas_1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from varchar_ctas_1 order by vs, c1, c2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@varchar_ctas_1
+#### A masked pattern was here ####
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc
+cvLH6Eat2y 10 y2taE6HLvc