You are viewing a plain text version of this content. The canonical link for it was provided in the original HTML message but is not preserved in this plain-text rendering.
Posted to commits@hive.apache.org by ha...@apache.org on 2020/10/06 04:40:00 UTC
[hive] branch master updated: HIVE-24209 : Incorrect search argument conversion for NOT BETWEEN operation when vectorization is enabled (Ganesha Shreedhara via Ashutosh Chauhan)
This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 041b9c2 HIVE-24209 : Incorrect search argument conversion for NOT BETWEEN operation when vectorization is enabled (Ganesha Shreedhara via Ashutosh Chauhan)
041b9c2 is described below
commit 041b9c25c3694c3fa09b132705eecccab96c6385
Author: Ganesha Shreedhara <ga...@gmail.com>
AuthorDate: Mon Oct 5 21:39:18 2020 -0700
HIVE-24209 : Incorrect search argument conversion for NOT BETWEEN operation when vectorization is enabled (Ganesha Shreedhara via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
.../hive/ql/io/sarg/ConvertAstToSearchArg.java | 9 +-
.../clientpositive/vector_between_columns.q | 43 +++++
.../llap/vector_between_columns.q.out | 211 +++++++++++++++++++++
3 files changed, 262 insertions(+), 1 deletion(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
index 764c401..fd24eaa 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
@@ -471,7 +471,14 @@ public class ConvertAstToSearchArg {
} else if (op == GenericUDFIn.class) {
createLeaf(PredicateLeaf.Operator.IN, expr, 0);
} else if (op == GenericUDFBetween.class) {
- createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
+ // Start with NOT operator when the first child of GenericUDFBetween operator is set to TRUE
+ if (Boolean.TRUE.equals(((ExprNodeConstantDesc) expression.getChildren().get(0)).getValue())) {
+ builder.startNot();
+ createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
+ builder.end();
+ } else {
+ createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
+ }
} else if (op == GenericUDFOPNull.class) {
createLeaf(PredicateLeaf.Operator.IS_NULL, expr, 0);
} else if (op == GenericUDFOPNotNull.class) {
diff --git a/ql/src/test/queries/clientpositive/vector_between_columns.q b/ql/src/test/queries/clientpositive/vector_between_columns.q
index a8e9ca4..63142f3 100644
--- a/ql/src/test/queries/clientpositive/vector_between_columns.q
+++ b/ql/src/test/queries/clientpositive/vector_between_columns.q
@@ -5,6 +5,7 @@ SET hive.auto.convert.join=true;
set hive.fetch.task.conversion=none;
set hive.mapred.mode=nonstrict;
set hive.join.inner.residual=false;
+set hive.optimize.index.filter=true;
-- SORT_QUERY_RESULTS
--
@@ -30,6 +31,43 @@ create table TINT stored as orc AS SELECT * FROM TINT_txt;
-- Add a single NULL row that will come from ORC as isRepeated.
insert into TINT values (NULL, NULL);
+CREATE EXTERNAL TABLE test_orc_ppd(
+ data_release bigint,
+ data_owner_ver_id bigint,
+ data_owner_dim_id bigint,
+ data_source_ver_id bigint,
+ data_source_dim_id bigint,
+ data_client_ver_id bigint,
+ data_client_dim_id bigint,
+ data_client_sub_ver_id bigint,
+ data_client_sub_dim_id bigint,
+ quarter_dim_id bigint,
+ market_dim_id bigint,
+ daypart_dim_id bigint,
+ demo_dim_id bigint,
+ station_dim_id bigint,
+ medium_dim_id bigint,
+ ad_length int,
+ exclude int,
+ population int,
+ client_cpp double,
+ client_cpm double,
+ low_cpp double,
+ mid_cpp double,
+ high_cpp double,
+ low_cpm double,
+ mid_cpm double,
+ high_cpm double,
+ low_cpp_index double,
+ mid_cpp_index double,
+ high_cpp_index double,
+ low_cpm_index double,
+ mid_cpm_index double,
+ high_cpm_index double)
+ STORED AS ORC;
+LOAD DATA LOCAL INPATH '../../data/files/orc_test_ppd'
+OVERWRITE INTO TABLE test_orc_ppd;
+
explain vectorization expression
select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint;
@@ -40,3 +78,8 @@ explain vectorization expression
select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint;
select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint;
+
+explain vectorization expression
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release;
+
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release;
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out b/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out
index 546dc45..cb6d521 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out
@@ -82,6 +82,88 @@ POSTHOOK: Output: default@tint
POSTHOOK: Lineage: tint.cint EXPRESSION []
POSTHOOK: Lineage: tint.rnum EXPRESSION []
_col0 _col1
+PREHOOK: query: CREATE EXTERNAL TABLE test_orc_ppd(
+ data_release bigint,
+#### A masked pattern was here ####
+ data_source_ver_id bigint,
+ data_source_dim_id bigint,
+ data_client_ver_id bigint,
+ data_client_dim_id bigint,
+ data_client_sub_ver_id bigint,
+ data_client_sub_dim_id bigint,
+ quarter_dim_id bigint,
+ market_dim_id bigint,
+ daypart_dim_id bigint,
+ demo_dim_id bigint,
+ station_dim_id bigint,
+ medium_dim_id bigint,
+ ad_length int,
+ exclude int,
+ population int,
+ client_cpp double,
+ client_cpm double,
+ low_cpp double,
+ mid_cpp double,
+ high_cpp double,
+ low_cpm double,
+ mid_cpm double,
+ high_cpm double,
+ low_cpp_index double,
+ mid_cpp_index double,
+ high_cpp_index double,
+ low_cpm_index double,
+ mid_cpm_index double,
+ high_cpm_index double)
+ STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_orc_ppd
+POSTHOOK: query: CREATE EXTERNAL TABLE test_orc_ppd(
+ data_release bigint,
+#### A masked pattern was here ####
+ data_source_ver_id bigint,
+ data_source_dim_id bigint,
+ data_client_ver_id bigint,
+ data_client_dim_id bigint,
+ data_client_sub_ver_id bigint,
+ data_client_sub_dim_id bigint,
+ quarter_dim_id bigint,
+ market_dim_id bigint,
+ daypart_dim_id bigint,
+ demo_dim_id bigint,
+ station_dim_id bigint,
+ medium_dim_id bigint,
+ ad_length int,
+ exclude int,
+ population int,
+ client_cpp double,
+ client_cpm double,
+ low_cpp double,
+ mid_cpp double,
+ high_cpp double,
+ low_cpm double,
+ mid_cpm double,
+ high_cpm double,
+ low_cpp_index double,
+ mid_cpp_index double,
+ high_cpp_index double,
+ low_cpm_index double,
+ mid_cpm_index double,
+ high_cpm_index double)
+ STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_orc_ppd
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/orc_test_ppd'
+OVERWRITE INTO TABLE test_orc_ppd
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@test_orc_ppd
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/orc_test_ppd'
+OVERWRITE INTO TABLE test_orc_ppd
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@test_orc_ppd
Warning: Shuffle Join MERGEJOIN[9][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product
PREHOOK: query: explain vectorization expression
select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint
@@ -413,3 +495,132 @@ tint.rnum tsint.rnum tint.cint tsint.csint
2 2 0 0
3 3 1 1
4 4 10 10
+PREHOOK: query: explain vectorization expression
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+Explain
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: test_orc_ppd
+ filterExpr: data_release NOT BETWEEN 20191201L AND 20200101L (type: boolean)
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ TableScan Vectorization:
+ native: true
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+ predicateExpression: FilterLongColumnNotBetween(col 0:bigint, left 20191201, right 20200101)
+ predicate: data_release NOT BETWEEN 20191201L AND 20200101L (type: boolean)
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ Group By Vectorization:
+ aggregators: VectorUDAFCountStar(*) -> bigint
+ className: VectorGroupByOperator
+ groupByMode: HASH
+ keyExpressions: col 0:bigint
+ native: false
+ vectorProcessingMode: HASH
+ projectedOutputColumnNums: [0]
+ keys: data_release (type: bigint)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: bigint)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: bigint)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkLongOperator
+ native: true
+ nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+ enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ Group By Vectorization:
+ aggregators: VectorUDAFCountMerge(col 1:bigint) -> bigint
+ className: VectorGroupByOperator
+ groupByMode: MERGEPARTIAL
+ keyExpressions: col 0:bigint
+ native: false
+ vectorProcessingMode: MERGE_PARTIAL
+ projectedOutputColumnNums: [0]
+ keys: KEY._col0 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+POSTHOOK: query: select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+data_release _c1
+20190301 1024