You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sa...@apache.org on 2019/09/09 10:10:00 UTC

[hive] branch master updated: HIVE-22178: Parquet FilterPredicate throws CastException after SchemaEvolution (Naresh P R, reviewed by Sankar Hariappan)

This is an automated email from the ASF dual-hosted git repository.

sankarh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 47dcfbc  HIVE-22178: Parquet FilterPredicate throws CastException after SchemaEvolution (Naresh P R, reviewed by Sankar Hariappan)
47dcfbc is described below

commit 47dcfbc35675a97f4f09fb28be59d0cbca948552
Author: Naresh P R <pr...@gmail.com>
AuthorDate: Sat Sep 7 13:57:23 2019 +0530

    HIVE-22178: Parquet FilterPredicate throws CastException after SchemaEvolution (Naresh P R, reviewed by Sankar Hariappan)
    
    Signed-off-by: Sankar Hariappan <sa...@apache.org>
---
 .../hive/ql/io/parquet/LeafFilterFactory.java      |  43 ++-
 .../parquet/read/TestParquetFilterPredicate.java   |   4 +-
 .../clientpositive/parquet_schema_evolution.q      |  25 ++
 .../clientpositive/parquet_schema_evolution.q.out  | 359 +++++++++++++++++++++
 4 files changed, 415 insertions(+), 16 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java
index be4c0d5..fc9188f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java
@@ -174,23 +174,10 @@ public class LeafFilterFactory {
       Type parquetType) throws HiveException {
     switch (type){
       case LONG:
-        if (parquetType.asPrimitiveType().getPrimitiveTypeName() ==
-            PrimitiveType.PrimitiveTypeName.INT32) {
-          return new IntFilterPredicateLeafBuilder();
-        } else {
-          return new LongFilterPredicateLeafBuilder();
-        }
       case FLOAT:
-        if (parquetType.asPrimitiveType().getPrimitiveTypeName() ==
-            PrimitiveType.PrimitiveTypeName.FLOAT) {
-          return new FloatFilterPredicateLeafBuilder();
-        } else {
-          return new DoubleFilterPredicateLeafBuilder();
-        }
       case STRING:  // string, char, varchar
-        return new BinaryFilterPredicateLeafBuilder();
       case BOOLEAN:
-        return new BooleanFilterPredicateLeafBuilder();
+        return getLeafFilterBuilderByParquetType(parquetType);
       case DATE:
       case DECIMAL:
       case TIMESTAMP:
@@ -200,4 +187,32 @@ public class LeafFilterFactory {
         throw new HiveException(msg);
     }
   }
+
+  /**
+   * Creates the FilterPredicateLeafBuilder matching the column's Parquet file-schema type.
+   * @param parquetType Parquet file-schema type of the column; read via asPrimitiveType()
+   * @return a new leaf builder for an INT32, INT64, FLOAT, DOUBLE, BINARY or BOOLEAN column
+   * @throws HiveException if the Parquet primitive type name is none of the above
+   */
+  private FilterPredicateLeafBuilder getLeafFilterBuilderByParquetType(Type parquetType) throws HiveException {
+    switch (parquetType.asPrimitiveType().getPrimitiveTypeName()){
+      case INT32: // TINYINT, SMALLINT, INT
+        return new IntFilterPredicateLeafBuilder();
+      case INT64: // LONG
+        return new LongFilterPredicateLeafBuilder();
+      case FLOAT:
+        return new FloatFilterPredicateLeafBuilder();
+      case DOUBLE:
+        return new DoubleFilterPredicateLeafBuilder();
+      case BINARY: // STRING, CHAR, VARCHAR
+        return new BinaryFilterPredicateLeafBuilder();
+      case BOOLEAN:
+        return new BooleanFilterPredicateLeafBuilder();
+      default:
+        String msg = "Conversion to Parquet FilterPredicate not supported for "
+            + parquetType.asPrimitiveType().getPrimitiveTypeName();
+        LOG.debug(msg);
+        throw new HiveException(msg);
+    }
+  }
 }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java
index d464046..7c7c657 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/read/TestParquetFilterPredicate.java
@@ -38,14 +38,14 @@ public class TestParquetFilterPredicate {
             .isNull("a", PredicateLeaf.Type.LONG)
             .between("y", PredicateLeaf.Type.LONG, 10L, 20L) // Column will be removed from filter
             .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L) // Column will be removed from filter
-            .nullSafeEquals("a", PredicateLeaf.Type.STRING, "stinger")
+            .nullSafeEquals("stinger", PredicateLeaf.Type.STRING, "stinger")
             .end()
             .end()
             .build();
 
     FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
 
-    String expected = "and(not(eq(a, null)), not(eq(a, Binary{\"stinger\"})))";
+    String expected = "and(not(eq(a, null)), not(eq(stinger, Binary{\"stinger\"})))";
     assertEquals(expected, p.toString());
   }
 
diff --git a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q
index 4f593af..a82b641 100644
--- a/ql/src/test/queries/clientpositive/parquet_schema_evolution.q
+++ b/ql/src/test/queries/clientpositive/parquet_schema_evolution.q
@@ -44,3 +44,28 @@ SELECT * FROM schema_test;
 DROP TABLE schema_test;
 DROP TABLE NewStructField;
 DROP TABLE NewStructFieldTable;
+
+drop table if exists parq_test;
+create table parq_test(age int, name string) stored as parquet;
+insert into parq_test values(1, 'aaaa');
+
+DESCRIBE parq_test;
+alter table parq_test change age age string;
+DESCRIBE parq_test;
+
+insert into parq_test values('b', 'bbbb');
+
+select * from parq_test;
+select * from parq_test where age='b';
+select * from parq_test where age='1';
+select * from parq_test where age=1;
+
+explain select * from parq_test where age='b';
+explain select * from parq_test where age='1';
+explain select * from parq_test where age=1;
+
+explain vectorization expression select * from parq_test where age='b';
+explain vectorization expression select * from parq_test where age='1';
+explain vectorization expression select * from parq_test where age=1;
+
+drop table parq_test;
diff --git a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out
index 43d75dc..3c38ed0 100644
--- a/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out
+++ b/ql/src/test/results/clientpositive/parquet_schema_evolution.q.out
@@ -188,3 +188,362 @@ POSTHOOK: query: DROP TABLE NewStructFieldTable
 POSTHOOK: type: DROPTABLE
 POSTHOOK: Input: default@newstructfieldtable
 POSTHOOK: Output: default@newstructfieldtable
+PREHOOK: query: drop table if exists parq_test
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists parq_test
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table parq_test(age int, name string) stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: create table parq_test(age int, name string) stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parq_test
+PREHOOK: query: insert into parq_test values(1, 'aaaa')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: insert into parq_test values(1, 'aaaa')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@parq_test
+POSTHOOK: Lineage: parq_test.age SCRIPT []
+POSTHOOK: Lineage: parq_test.name SCRIPT []
+PREHOOK: query: DESCRIBE parq_test
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parq_test
+POSTHOOK: query: DESCRIBE parq_test
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parq_test
+age                 	int                 	                    
+name                	string              	                    
+PREHOOK: query: alter table parq_test change age age string
+PREHOOK: type: ALTERTABLE_RENAMECOL
+PREHOOK: Input: default@parq_test
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: alter table parq_test change age age string
+POSTHOOK: type: ALTERTABLE_RENAMECOL
+POSTHOOK: Input: default@parq_test
+POSTHOOK: Output: default@parq_test
+PREHOOK: query: DESCRIBE parq_test
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@parq_test
+POSTHOOK: query: DESCRIBE parq_test
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@parq_test
+age                 	string              	                    
+name                	string              	                    
+PREHOOK: query: insert into parq_test values('b', 'bbbb')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: insert into parq_test values('b', 'bbbb')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@parq_test
+POSTHOOK: Lineage: parq_test.age SCRIPT []
+POSTHOOK: Lineage: parq_test.name SCRIPT []
+PREHOOK: query: select * from parq_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+1	aaaa
+b	bbbb
+PREHOOK: query: select * from parq_test where age='b'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test where age='b'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+b	bbbb
+PREHOOK: query: select * from parq_test where age='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test where age='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+1	aaaa
+PREHOOK: query: select * from parq_test where age=1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parq_test where age=1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+1	aaaa
+PREHOOK: query: explain select * from parq_test where age='b'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from parq_test where age='b'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parq_test
+            filterExpr: (age = 'b') (type: boolean)
+            Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+            Filter Operator
+              predicate: (age = 'b') (type: boolean)
+              Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+              Select Operator
+                expressions: 'b' (type: string), name (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select * from parq_test where age='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from parq_test where age='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parq_test
+            filterExpr: (age = '1') (type: boolean)
+            Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+            Filter Operator
+              predicate: (age = '1') (type: boolean)
+              Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+              Select Operator
+                expressions: '1' (type: string), name (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select * from parq_test where age=1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from parq_test where age=1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parq_test
+            filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean)
+            Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+            Filter Operator
+              predicate: (UDFToDouble(age) = 1.0D) (type: boolean)
+              Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+              Select Operator
+                expressions: age (type: string), name (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain vectorization expression select * from parq_test where age='b'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression select * from parq_test where age='b'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+  enabled: false
+  enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parq_test
+            filterExpr: (age = 'b') (type: boolean)
+            Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+            Filter Operator
+              predicate: (age = 'b') (type: boolean)
+              Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+              Select Operator
+                expressions: 'b' (type: string), name (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain vectorization expression select * from parq_test where age='1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression select * from parq_test where age='1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+  enabled: false
+  enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parq_test
+            filterExpr: (age = '1') (type: boolean)
+            Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+            Filter Operator
+              predicate: (age = '1') (type: boolean)
+              Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+              Select Operator
+                expressions: '1' (type: string), name (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 346 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain vectorization expression select * from parq_test where age=1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression select * from parq_test where age=1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parq_test
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+  enabled: false
+  enabledConditionsNotMet: [hive.vectorized.execution.enabled IS false]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parq_test
+            filterExpr: (UDFToDouble(age) = 1.0D) (type: boolean)
+            Statistics: Num rows: 2 Data size: 544 Basic stats: COMPLETE Column stats: PARTIAL
+            Filter Operator
+              predicate: (UDFToDouble(age) = 1.0D) (type: boolean)
+              Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+              Select Operator
+                expressions: age (type: string), name (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 272 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: drop table parq_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@parq_test
+PREHOOK: Output: default@parq_test
+POSTHOOK: query: drop table parq_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@parq_test
+POSTHOOK: Output: default@parq_test