You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sa...@apache.org on 2019/09/09 10:10:00 UTC
[hive] branch master updated: HIVE-22178: Parquet FilterPredicate
throws CastException after SchemaEvolution (Naresh P R,
reviewed by Sankar Hariappan)
This is an automated email from the ASF dual-hosted git repository.
sankarh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 47dcfbc HIVE-22178: Parquet FilterPredicate throws CastException after SchemaEvolution (Naresh P R, reviewed by Sankar Hariappan)
47dcfbc is described below
commit 47dcfbc35675a97f4f09fb28be59d0cbca948552
Author: Naresh P R <pr...@gmail.com>
AuthorDate: Sat Sep 7 13:57:23 2019 +0530
HIVE-22178: Parquet FilterPredicate throws CastException after SchemaEvolution (Naresh P R, reviewed by Sankar Hariappan)
Signed-off-by: Sankar Hariappan <sa...@apache.org>
---
.../hive/ql/io/parquet/LeafFilterFactory.java | 43 ++-
.../parquet/read/TestParquetFilterPredicate.java | 4 +-
.../clientpositive/parquet_schema_evolution.q | 25 ++
.../clientpositive/parquet_schema_evolution.q.out | 359 +++++++++++++++++++++
4 files changed, 415 insertions(+), 16 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java
index be4c0d5..fc9188f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java
@@ -174,23 +174,10 @@ public class LeafFilterFactory {
Type parquetType) throws HiveException {
switch (type){
case LONG:
- if (parquetType.asPrimitiveType().getPrimitiveTypeName() ==
- PrimitiveType.PrimitiveTypeName.INT32) {
- return new IntFilterPredicateLeafBuilder();
- } else {
- return new LongFilterPredicateLeafBuilder();
- }
case FLOAT:
- if (parquetType.asPrimitiveType().getPrimitiveTypeName() ==
- PrimitiveType.PrimitiveTypeName.FLOAT) {
- return new FloatFilterPredicateLeafBuilder();
- } else {
- return new DoubleFilterPredicateLeafBuilder();
- }
case STRING: // string, char, varchar
- return new BinaryFilterPredicateLeafBuilder();
case BOOLEAN:
- return new BooleanFilterPredicateLeafBuilder();
+ return getLeafFilterBuilderByParquetType(parquetType);
case DATE:
case DECIMAL:
case TIMESTAMP:
@@ -200,4 +187,32 @@ public class LeafFilterFactory {
throw new HiveException(msg);
}
}
+
+ /**
+ * Creates the FilterPredicateLeafBuilder that matches the Parquet file-schema primitive type.
+ * @param parquetType Parquet type of the column; read via asPrimitiveType().getPrimitiveTypeName()
+ * @return a new leaf builder for INT32, INT64, FLOAT, DOUBLE, BINARY or BOOLEAN
+ * @throws HiveException if the primitive type name is none of the cases handled below
+ */
+ private FilterPredicateLeafBuilder getLeafFilterBuilderByParquetType(Type parquetType) throws HiveException {
+ switch (parquetType.asPrimitiveType().getPrimitiveTypeName()){
+ case INT32: // TINYINT, SMALLINT, INT
+ return new IntFilterPredicateLeafBuilder();
+ case INT64: // LONG
+ return new LongFilterPredicateLeafBuilder();
+ case FLOAT:
+ return new FloatFilterPredicateLeafBuilder();
+ case DOUBLE:
+ return new DoubleFilterPredicateLeafBuilder();
+ case BINARY: // STRING, CHAR, VARCHAR
+ return new BinaryFilterPredicateLeafBuilder();
+ case BOOLEAN:
+ return new BooleanFilterPredicateLeafBuilder();
+ default:
+ String msg = "Conversion to Parquet FilterPredicate not supported for "
+ + parquetType.asPrimitiveType().getPrimitiveTypeName();
+ LOG.debug(msg);
+ throw new HiveException(msg);
+ }
+ }
}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java
index d464046..7c7c657 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java
@@ -38,14 +38,14 @@ public class TestParquetFilterPredicate {
.isNull("a", PredicateLeaf.Type.LONG)
.between("y", PredicateLeaf.Type.LONG, 10L, 20L) // Column will be removed from filter
.in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L) // Column will be removed from filter
- .nullSafeEquals("a", PredicateLeaf.Type.STRING, "stinger")
+ .nullSafeEquals("stinger", PredicateLeaf.Type.STRING, "stinger")
.end()
.end()
.build();
FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
- String expected = "and(not(eq(a, null)), not(eq(a, Binary{\"stinger\"})))";
+ String expected = "and(not(eq(a, null)), not(eq(stinger, Binary{\"stinger\"})))";
assertEquals(expected, p.toString());
}
diff --git a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q
index 4f593af..a82b641 100644
--- a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q
+++ b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q
@@ -44,3 +44,28 @@ SELECT * FROM schema_test;
DROP TABLE schema_test;
DROP TABLE NewStructField;
DROP TABLE NewStructFieldTable;
+
+drop table if exists parq_test;
+create table parq_test(age int, name string) stored as parquet;
+insert into parq_test values(1, 'aaaa');
+
+DESCRIBE parq_test;
+alter table parq_test change age age string;
+DESCRIBE parq_test;
+
+insert into parq_test values('b', 'bbbb');
+
+select * from parq_test;
+select * from parq_test where age='b';
+select * from parq_test where age='1';
+select * from parq_test where age=1;
+
+explain select * from parq_test where age='b';
+explain select * from parq_test where age='1';
+explain select * from parq_test where age=1;
+
+explain vectorization expression select * from parq_test where age='b';
+explain vectorization expression select * from parq_test where age='1';
+explain vectorization expression select * from parq_test where age=1;
+
+drop table parq_test;
diff --git a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out
index 43d75dc..3c38ed0 100644
--- a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out
+++ b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out
@@ -188,3 +188,362 @@ POSTHOOK: query: DROP TABLE NewStructFieldTable
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@newstructfieldtable
POSTHOOK: Output: default@newstructfieldtable
+PREHOOK: query: drop table if exists parq_test
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists parq_test
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table parq_test(age int, name string) stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: create table parq_test(age int, name string) stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parq_test
+PREHOOK: query: insert into parq_test values(1, 'aaaa')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: insert into parq_test values(1, 'aaaa')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@parq_test
+POSTHOOK: Lineage: parq_test.age SCRIPT []
+POSTHOOK: Lineage: parq_test.name SCRIPT []
+PREHOOK: query: DESCRIBE parq_test
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parq_test
+POSTHOOK: query: DESCRIBE parq_test
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parq_test
+age int
+name string
+PREHOOK: query: alter table parq_test change age age string
+PREHOOK: type: ALTERTABLE_RENAMECOL
+PREHOOK: Input: default@parq_test
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: alter table parq_test change age age string
+POSTHOOK: type: ALTERTABLE_RENAMECOL
+POSTHOOK: Input: default@parq_test
+POSTHOOK: Output: default@parq_test
+PREHOOK: query: DESCRIBE parq_test
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parq_test
+POSTHOOK: query: DESCRIBE parq_test
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parq_test
+age string
+name string
+PREHOOK: query: insert into parq_test values('b', 'bbbb')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: insert into parq_test values('b', 'bbbb')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@parq_test
+POSTHOOK: Lineage: parq_test.age SCRIPT []
+POSTHOOK: Lineage: parq_test.name SCRIPT []
+PREHOOK: query: select * from parq_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+1 aaaa
+b bbbb
+PREHOOK: query: select * from parq_test where age='b'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test where age='b'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+b bbbb
+PREHOOK: query: select * from parq_test where age='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test where age='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+1 aaaa
+PREHOOK: query: select * from parq_test where age=1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test where age=1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+1 aaaa
+PREHOOK: query: explain select * from parq_test where age='b'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from parq_test where age='b'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parq_test
+ filterExpr: (age = 'b') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (age = 'b') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: 'b' (type: string), name (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select * from parq_test where age='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from parq_test where age='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parq_test
+ filterExpr: (age = '1') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (age = '1') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: '1' (type: string), name (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select * from parq_test where age=1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from parq_test where age=1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parq_test
+ filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (UDFToDouble(age) = 1.0D) (type: boolean)
+ Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: age (type: string), name (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain vectorization expression select * from parq_test where age='b'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression select * from parq_test where age='b'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+ enabled: false
+ enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parq_test
+ filterExpr: (age = 'b') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (age = 'b') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: 'b' (type: string), name (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain vectorization expression select * from parq_test where age='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression select * from parq_test where age='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+ enabled: false
+ enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parq_test
+ filterExpr: (age = '1') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (age = '1') (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: '1' (type: string), name (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain vectorization expression select * from parq_test where age=1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression select * from parq_test where age=1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+ enabled: false
+ enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: parq_test
+ filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean)
+ Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (UDFToDouble(age) = 1.0D) (type: boolean)
+ Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: age (type: string), name (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: drop table parq_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parq_test
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: drop table parq_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parq_test
+POSTHOOK: Output: default@parq_test