You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by px...@apache.org on 2015/09/05 02:33:04 UTC
[3/3] hive git commit: HIVE-10802 : Table join query with some
constant field in select fails (Aihua Xu via Ashutosh Chauhan)
HIVE-10802 : Table join query with some constant field in select fails (Aihua Xu via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1c80f2e5
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1c80f2e5
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1c80f2e5
Branch: refs/heads/branch-1.2
Commit: 1c80f2e5ab6a03005bbaf9d465e9740f6fc2f8d9
Parents: e1b7bf7
Author: Aihua Xu <ai...@gmail.com>
Authored: Sat May 30 15:13:00 2015 -0700
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Fri Sep 4 17:28:20 2015 -0700
----------------------------------------------------------------------
.../optimizer/ConstantPropagateProcFactory.java | 57 +++++----
.../clientpositive/udf_nondeterministic.q | 11 ++
.../clientpositive/spark/udf_in_file.q.out | 2 +-
.../results/clientpositive/udf_in_file.q.out | 2 +-
.../clientpositive/udf_nondeterministic.q.out | 120 +++++++++++++++++++
5 files changed, 164 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/1c80f2e5/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
index 20e23d5..f9df8e5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java
@@ -263,9 +263,13 @@ public final class ConstantPropagateProcFactory {
/**
* Fold input expression desc.
*
- * If desc is a UDF and all parameters are constants, evaluate it. If desc is a column expression,
- * find it from propagated constants, and if there is, replace it with constant.
- *
+ * This function recursively checks if any subexpression of a specified expression
+ * can be evaluated to be constant and replaces such subexpression with the constant.
+ * If the expression is a derterministic UDF and all the subexpressions are constants,
+ * the value will be calculated immediately (during compilation time vs. runtime).
+ * e.g.:
+ * concat(year, month) => 200112 for year=2001, month=12 since concat is deterministic UDF
+ * unix_timestamp(time) => unix_timestamp(123) for time=123 since unix_timestamp is nonderministic UDF
* @param desc folding expression
* @param constants current propagated constant map
* @param cppCtx
@@ -280,12 +284,7 @@ public final class ConstantPropagateProcFactory {
if (desc instanceof ExprNodeGenericFuncDesc) {
ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) desc;
- // The function must be deterministic, or we can't fold it.
GenericUDF udf = funcDesc.getGenericUDF();
- if (!isDeterministicUdf(udf)) {
- LOG.debug("Function " + udf.getClass() + " undeterministic, quit folding.");
- return desc;
- }
boolean propagateNext = propagate && propagatableUdfs.contains(udf.getClass());
List<ExprNodeDesc> newExprs = new ArrayList<ExprNodeDesc>();
@@ -293,27 +292,33 @@ public final class ConstantPropagateProcFactory {
newExprs.add(foldExpr(childExpr, constants, cppCtx, op, tag, propagateNext));
}
- // If all child expressions are constants, evaluate UDF immediately
- ExprNodeDesc constant = evaluateFunction(udf, newExprs, desc.getChildren());
- if (constant != null) {
- LOG.debug("Folding expression:" + desc + " -> " + constant);
- return constant;
+ // Don't evalulate nondeterministic function since the value can only calculate during runtime.
+ if (!isDeterministicUdf(udf)) {
+ LOG.debug("Function " + udf.getClass() + " is undeterministic. Don't evalulating immediately.");
+ ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
+ return desc;
} else {
-
- // Check if the function can be short cut.
- ExprNodeDesc shortcut = shortcutFunction(udf, newExprs, op);
- if (shortcut != null) {
- LOG.debug("Folding expression:" + desc + " -> " + shortcut);
- return shortcut;
+ // If all child expressions of deterministic function are constants, evaluate such UDF immediately
+ ExprNodeDesc constant = evaluateFunction(udf, newExprs, desc.getChildren());
+ if (constant != null) {
+ LOG.debug("Folding expression:" + desc + " -> " + constant);
+ return constant;
+ } else {
+ // Check if the function can be short cut.
+ ExprNodeDesc shortcut = shortcutFunction(udf, newExprs, op);
+ if (shortcut != null) {
+ LOG.debug("Folding expression:" + desc + " -> " + shortcut);
+ return shortcut;
+ }
+ ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
}
- ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
- }
- // If in some selected binary operators (=, is null, etc), one of the
- // expressions are
- // constant, add them to colToConstatns as half-deterministic columns.
- if (propagate) {
- propagate(udf, newExprs, op.getSchema(), constants);
+ // If in some selected binary operators (=, is null, etc), one of the
+ // expressions are
+ // constant, add them to colToConstants as half-deterministic columns.
+ if (propagate) {
+ propagate(udf, newExprs, op.getSchema(), constants);
+ }
}
return desc;
http://git-wip-us.apache.org/repos/asf/hive/blob/1c80f2e5/ql/src/test/queries/clientpositive/udf_nondeterministic.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/udf_nondeterministic.q b/ql/src/test/queries/clientpositive/udf_nondeterministic.q
new file mode 100644
index 0000000..fcb5162
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/udf_nondeterministic.q
@@ -0,0 +1,11 @@
+drop table if exists udf_tb1;
+drop table if exists udf_tb2;
+
+create table udf_tb1 (year int, month int);
+create table udf_tb2(month int);
+insert into udf_tb1 values(2001, 11);
+insert into udf_tb2 values(11);
+
+explain
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month);
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month);
http://git-wip-us.apache.org/repos/asf/hive/blob/1c80f2e5/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/udf_in_file.q.out b/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
index c5b6205..5ba62b0 100644
--- a/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
+++ b/ql/src/test/results/clientpositive/spark/udf_in_file.q.out
@@ -59,7 +59,7 @@ STAGE PLANS:
alias: value_src
Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(UDFToString(null), '../../data/files/test2.dat') (type: boolean)
+ expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(null, '../../data/files/test2.dat') (type: boolean)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
Limit
http://git-wip-us.apache.org/repos/asf/hive/blob/1c80f2e5/ql/src/test/results/clientpositive/udf_in_file.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_in_file.q.out b/ql/src/test/results/clientpositive/udf_in_file.q.out
index b9d0d25..6d2c84b 100644
--- a/ql/src/test/results/clientpositive/udf_in_file.q.out
+++ b/ql/src/test/results/clientpositive/udf_in_file.q.out
@@ -59,7 +59,7 @@ STAGE PLANS:
alias: value_src
Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
Select Operator
- expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(UDFToString(null), '../../data/files/test2.dat') (type: boolean)
+ expressions: in_file(str_val, '../../data/files/test2.dat') (type: boolean), in_file(ch_val, '../../data/files/test2.dat') (type: boolean), in_file(vch_val, '../../data/files/test2.dat') (type: boolean), in_file(str_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(ch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file(vch_val_neg, '../../data/files/test2.dat') (type: boolean), in_file('303', '../../data/files/test2.dat') (type: boolean), in_file('304', '../../data/files/test2.dat') (type: boolean), in_file(null, '../../data/files/test2.dat') (type: boolean)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE
Limit
http://git-wip-us.apache.org/repos/asf/hive/blob/1c80f2e5/ql/src/test/results/clientpositive/udf_nondeterministic.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_nondeterministic.q.out b/ql/src/test/results/clientpositive/udf_nondeterministic.q.out
new file mode 100644
index 0000000..eef5555
--- /dev/null
+++ b/ql/src/test/results/clientpositive/udf_nondeterministic.q.out
@@ -0,0 +1,120 @@
+PREHOOK: query: drop table if exists udf_tb1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists udf_tb1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists udf_tb2
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists udf_tb2
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table udf_tb1 (year int, month int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@udf_tb1
+POSTHOOK: query: create table udf_tb1 (year int, month int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@udf_tb1
+PREHOOK: query: create table udf_tb2(month int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@udf_tb2
+POSTHOOK: query: create table udf_tb2(month int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@udf_tb2
+PREHOOK: query: insert into udf_tb1 values(2001, 11)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@udf_tb1
+POSTHOOK: query: insert into udf_tb1 values(2001, 11)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@udf_tb1
+POSTHOOK: Lineage: udf_tb1.month EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: udf_tb1.year EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into udf_tb2 values(11)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@udf_tb2
+POSTHOOK: query: insert into udf_tb2 values(11)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@udf_tb2
+POSTHOOK: Lineage: udf_tb2.month EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: explain
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: udf_tb1
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: ((year = 2001) and month is not null) (type: boolean)
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: month (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ TableScan
+ alias: b
+ Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: month is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: month (type: int)
+ sort order: +
+ Map-reduce partition columns: month (type: int)
+ Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 month (type: int)
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: unix_timestamp('2001-01-01 00:00:00') (type: bigint)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 7 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@udf_tb1
+PREHOOK: Input: default@udf_tb2
+#### A masked pattern was here ####
+POSTHOOK: query: select unix_timestamp(concat(a.year, '-01-01 00:00:00')) from (select * from udf_tb1 where year=2001) a join udf_tb2 b on (a.month=b.month)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@udf_tb1
+POSTHOOK: Input: default@udf_tb2
+#### A masked pattern was here ####
+978336000