You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2022/09/22 11:51:58 UTC
[hive] branch master updated: HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita)
This is an automated email from the ASF dual-hosted git repository.
szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new a017e54c98c HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita)
a017e54c98c is described below
commit a017e54c98c76ccf0c185b47533b336b0a398dc7
Author: ghanko <54...@users.noreply.github.com>
AuthorDate: Thu Sep 22 13:51:47 2022 +0200
HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita)
---
.../hive/ql/exec/vector/VectorizationContext.java | 3 +
.../expressions/FilterStructColumnInList.java | 11 +++-
.../exec/vector/expressions/VectorExpression.java | 69 +++++++++-------------
.../hadoop/hive/ql/udf/generic/GenericUDFIn.java | 9 ++-
.../queries/clientpositive/vector_struct_in2.q | 26 ++++++++
.../clientpositive/llap/vector_struct_in2.q.out | 62 +++++++++++++++++++
6 files changed, 137 insertions(+), 43 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index b5b0b764cd0..ba980f9375c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -2834,6 +2834,9 @@ import com.google.common.annotations.VisibleForTesting;
private VectorExpression getInExpression(List<ExprNodeDesc> childExpr,
VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
ExprNodeDesc colExpr = childExpr.get(0);
+ if (colExpr instanceof ExprNodeConstantDesc) {
+ return null;
+ }
List<ExprNodeDesc> inChildren = childExpr.subList(1, childExpr.size());
String colType = colExpr.getTypeString();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java
index ca1bf4291ea..115e7fab229 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java
@@ -18,7 +18,10 @@
package org.apache.hadoop.hive.ql.exec.vector.expressions;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
@@ -53,7 +56,7 @@ public class FilterStructColumnInList extends FilterStringColumnInList implement
/**
* After construction you must call setInListValues() to add the values to the IN set
* (on the IStringInExpr interface).
- *
+ * <p>
* And, call a and b on the IStructInExpr interface.
*/
public FilterStructColumnInList() {
@@ -174,4 +177,10 @@ public class FilterStructColumnInList extends FilterStringColumnInList implement
", structColumnMap " + Arrays.toString(structColumnMap);
}
+ protected Collection<VectorExpression> getChildExpressionsForTransientInit() {
+ Collection<VectorExpression> result = new ArrayList<>(super.getChildExpressionsForTransientInit());
+ Collections.addAll(result, structExpressions);
+ return result;
+ }
+
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java
index 1aa7397b738..800a96f4eb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java
@@ -20,9 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
-import java.util.List;
+import java.util.Deque;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
@@ -36,32 +38,32 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
/**
* Base class for vector expressions.
- *
+ * <p>
* A vector expression is a vectorized execution tree that evaluates the same result as a (row-mode)
* ExprNodeDesc tree describes.
- *
+ * <p>
* A vector expression has 0, 1, or more parameters and an optional output column. These are
* normally passed to the vector expression object' constructor. A few special case classes accept
* extra parameters via set* method.
- *
+ * <p>
* A ExprNodeColumnDesc vectorizes to the IdentityExpression class where the input column number
* parameter is the same as the output column number.
- *
+ * <p>
* A ExprNodeGenericFuncDesc's generic function can vectorize to many different vectorized objects
* depending on the parameter expression kinds (column, constant, etc) and data types. Each
* vectorized class implements the getDecription which indicates the particular expression kind
* and data type specialization that class is designed for. The Description is used by the
* VectorizationContext class in matching the right vectorized class.
- *
+ * <p>
* The constructor parameters need to be in the same order as the generic function because
* the VectorizationContext class automates parameter generation and object construction.
- *
+ * <p>
* Type information is remembered for the input parameters and the output type.
- *
+ * <p>
* A vector expression has optional children vector expressions when 1 or more parameters need
* to be calculated into vector scratch columns. Columns and constants do not need children
* expressions.
- *
+ * <p>
* HOW TO to extend VectorExpression (some basic steps and hints):
* 1. Create a subclass, and write a proper getDescriptor() (column/scalar?, number for args?, etc.)
* 2. Define an explicit parameterless constructor
@@ -78,7 +80,7 @@ public abstract class VectorExpression implements Serializable {
/**
* Child expressions for parameters -- but only those that need to be computed.
- *
+ * <p>
* NOTE: Columns and constants are not included in the children. That is: column numbers and
* scalar values are passed via the constructor and remembered by the individual vector expression
* classes. They are not represented in the children.
@@ -88,7 +90,7 @@ public abstract class VectorExpression implements Serializable {
/**
* ALL input parameter type information is here including those for (non-computed) columns and
* scalar values.
- *
+ * <p>
* The vectorExpressionParameters() method is used to get the displayable string for the
* parameters used by EXPLAIN, logging, etc.
*/
@@ -106,7 +108,7 @@ public abstract class VectorExpression implements Serializable {
/**
* Input column numbers of the vector expression, which should be reused by vector expressions.
*/
- public int inputColumnNum[] = {-1, -1, -1};
+ public int[] inputColumnNum = {-1, -1, -1};
/*
* Use this constructor when there is NO output column.
@@ -125,9 +127,6 @@ public abstract class VectorExpression implements Serializable {
/**
* Constructor for 1 input column and 1 output column.
- *
- * @param inputColumnNum
- * @param outputColumnNum
*/
public VectorExpression(int inputColumnNum, int outputColumnNum) {
this(inputColumnNum, -1, -1, outputColumnNum);
@@ -135,10 +134,6 @@ public abstract class VectorExpression implements Serializable {
/**
* Constructor for 2 input columns and 1 output column.
- *
- * @param inputColumnNum
- * @param inputColumnNum2
- * @param outputColumnNum
*/
public VectorExpression(int inputColumnNum, int inputColumnNum2, int outputColumnNum) {
this(inputColumnNum, inputColumnNum2, -1, outputColumnNum);
@@ -148,11 +143,6 @@ public abstract class VectorExpression implements Serializable {
* Constructor for 3 input columns and 1 output column. Currently, VectorExpression is initialized
* for a maximum of 3 input columns. In case an expression with more than 3 columns wants to reuse
* logic here, inputColumnNum* fields should be extended.
- *
- * @param inputColumnNum
- * @param inputColumnNum2
- * @param inputColumnNum3
- * @param outputColumnNum
*/
public VectorExpression(int inputColumnNum, int inputColumnNum2, int inputColumnNum3, int outputColumnNum) {
// By default, no children or inputs.
@@ -172,10 +162,8 @@ public abstract class VectorExpression implements Serializable {
/**
* Convenience method for expressions that uses arbitrary number of input columns in an array.
- * @param inputColumnNum
- * @param outputColumnNum
*/
- public VectorExpression(int inputColumnNum[], int outputColumnNum) {
+ public VectorExpression(int[] inputColumnNum, int outputColumnNum) {
// By default, no children or inputs.
childExpressions = null;
inputTypeInfos = null;
@@ -200,6 +188,14 @@ public abstract class VectorExpression implements Serializable {
return childExpressions;
}
+ protected Collection<VectorExpression> getChildExpressionsForTransientInit() {
+ if (getChildExpressions() != null) {
+ return Arrays.asList(getChildExpressions());
+ } else {
+ return Collections.emptyList();
+ }
+ }
+
//------------------------------------------------------------------------------------------------
public void setInputTypeInfos(TypeInfo ...inputTypeInfos) {
@@ -251,17 +247,12 @@ public abstract class VectorExpression implements Serializable {
// Well, don't recurse but make sure all children are initialized.
vecExpr.transientInit(conf);
- List<VectorExpression> newChildren = new ArrayList<VectorExpression>();
- VectorExpression[] children = vecExpr.getChildExpressions();
- if (children != null) {
- Collections.addAll(newChildren, children);
- }
+
+ Deque<VectorExpression> newChildren = new ArrayDeque<>(vecExpr.getChildExpressionsForTransientInit());
+
while (!newChildren.isEmpty()) {
- VectorExpression childVecExpr = newChildren.remove(0);
- children = childVecExpr.getChildExpressions();
- if (children != null) {
- Collections.addAll(newChildren, children);
- }
+ VectorExpression childVecExpr = newChildren.removeFirst();
+ newChildren.addAll(childVecExpr.getChildExpressionsForTransientInit());
childVecExpr.transientInit(conf);
}
}
@@ -309,8 +300,6 @@ public abstract class VectorExpression implements Serializable {
}
/**
* This is the primary method to implement expression logic.
- * @param batch
- * @throws HiveException
*/
public abstract void evaluate(VectorizedRowBatch batch) throws HiveException;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java
index 24852e1b728..fcfb9c7cb04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java
@@ -184,8 +184,13 @@ public class GenericUDFIn extends GenericUDF {
break;
}
case STRUCT: {
- if (constantInSet.contains(((StructObjectInspector) compareOI).getStructFieldsDataAsList(conversionHelper
- .convertIfNecessary(arguments[0].get(), argumentOIs[0])))) {
+ Object value;
+ if (argumentOIs[0] instanceof ConstantObjectInspector) {
+ value = ((ConstantObjectInspector) argumentOIs[0]).getWritableConstantValue();
+ } else {
+ value = conversionHelper.convertIfNecessary(arguments[0].get(), argumentOIs[0]);
+ }
+ if (constantInSet.contains(((StructObjectInspector) compareOI).getStructFieldsDataAsList(value))) {
bw.set(true);
return bw;
}
diff --git a/ql/src/test/queries/clientpositive/vector_struct_in2.q b/ql/src/test/queries/clientpositive/vector_struct_in2.q
new file mode 100644
index 00000000000..8c5729240cd
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_struct_in2.q
@@ -0,0 +1,26 @@
+set hive.fetch.task.conversion=none;
+
+create table test (a string) partitioned by (y string, m string);
+insert into test values ('aa', 2022, 9);
+
+--=== original bug report, complex query ===============================================================================
+select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) );
+
+
+--=== simple test cases for the distinct causes of the failure of the complex query ====================================
+
+--this is needed not to optimize away the problematic parts of the queries
+set hive.cbo.enable=false;
+
+--embedded expression in struct - used to yield empty result
+select * from test where (struct(cast(y as int)) IN (struct(2022)));
+
+--first argument of in expression is const struct - used to yield empty result
+select * from test where (struct(2022) IN (struct(2022)));
+
+--these are needed not to optimize away the problematic part of the query
+set hive.optimize.constant.propagation=false;
+set hive.optimize.ppd=false;
+
+--first argument of in expression is const primitive - used to cause error
+select * from test where (2022 IN (2022));
diff --git a/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out b/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out
new file mode 100644
index 00000000000..a35c426d68a
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out
@@ -0,0 +1,62 @@
+PREHOOK: query: create table test (a string) partitioned by (y string, m string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: create table test (a string) partitioned by (y string, m string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: insert into test values ('aa', 2022, 9)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: insert into test values ('aa', 2022, 9)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Output: default@test@y=2022/m=9
+POSTHOOK: Lineage: test PARTITION(y=2022,m=9).a SCRIPT []
+PREHOOK: query: select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) )
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) )
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa 2022 9
+PREHOOK: query: select * from test where (struct(cast(y as int)) IN (struct(2022)))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (struct(cast(y as int)) IN (struct(2022)))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa 2022 9
+PREHOOK: query: select * from test where (struct(2022) IN (struct(2022)))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (struct(2022) IN (struct(2022)))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa 2022 9
+PREHOOK: query: select * from test where (2022 IN (2022))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (2022 IN (2022))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa 2022 9