You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2022/09/22 11:51:58 UTC

[hive] branch master updated: HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita)

This is an automated email from the ASF dual-hosted git repository.

szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new a017e54c98c HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita)
a017e54c98c is described below

commit a017e54c98c76ccf0c185b47533b336b0a398dc7
Author: ghanko <54...@users.noreply.github.com>
AuthorDate: Thu Sep 22 13:51:47 2022 +0200

    HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita)
---
 .../hive/ql/exec/vector/VectorizationContext.java  |  3 +
 .../expressions/FilterStructColumnInList.java      | 11 +++-
 .../exec/vector/expressions/VectorExpression.java  | 69 +++++++++-------------
 .../hadoop/hive/ql/udf/generic/GenericUDFIn.java   |  9 ++-
 .../queries/clientpositive/vector_struct_in2.q     | 26 ++++++++
 .../clientpositive/llap/vector_struct_in2.q.out    | 62 +++++++++++++++++++
 6 files changed, 137 insertions(+), 43 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index b5b0b764cd0..ba980f9375c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -2834,6 +2834,9 @@ import com.google.common.annotations.VisibleForTesting;
   private VectorExpression getInExpression(List<ExprNodeDesc> childExpr,
       VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
     ExprNodeDesc colExpr = childExpr.get(0);
+    if (colExpr instanceof ExprNodeConstantDesc) {
+      return null;
+    }
     List<ExprNodeDesc> inChildren = childExpr.subList(1, childExpr.size());
 
     String colType = colExpr.getTypeString();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java
index ca1bf4291ea..115e7fab229 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java
@@ -18,7 +18,10 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
@@ -53,7 +56,7 @@ public class FilterStructColumnInList extends FilterStringColumnInList implement
   /**
    * After construction you must call setInListValues() to add the values to the IN set
    * (on the IStringInExpr interface).
-   *
+   * <p>
    * And, call a and b on the IStructInExpr interface.
    */
   public FilterStructColumnInList() {
@@ -174,4 +177,10 @@ public class FilterStructColumnInList extends FilterStringColumnInList implement
         ", structColumnMap " + Arrays.toString(structColumnMap);
   }
 
+  protected Collection<VectorExpression> getChildExpressionsForTransientInit() {
+    Collection<VectorExpression> result = new ArrayList<>(super.getChildExpressionsForTransientInit());
+    Collections.addAll(result, structExpressions);
+    return result;
+  }
+
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java
index 1aa7397b738..800a96f4eb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java
@@ -20,9 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
 import java.io.Serializable;
 import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
+import java.util.Deque;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
@@ -36,32 +38,32 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
 
 /**
  * Base class for vector expressions.
- *
+ * <p>
  * A vector expression is a vectorized execution tree that evaluates the same result as a (row-mode)
  * ExprNodeDesc tree describes.
- *
+ * <p>
  * A vector expression has 0, 1, or more parameters and an optional output column.  These are
  * normally passed to the vector expression object' constructor.  A few special case classes accept
  * extra parameters via set* method.
- *
+ * <p>
  * A ExprNodeColumnDesc vectorizes to the IdentityExpression class where the input column number
  * parameter is the same as the output column number.
- *
+ * <p>
  * A ExprNodeGenericFuncDesc's generic function can vectorize to many different vectorized objects
  * depending on the parameter expression kinds (column, constant, etc) and data types.  Each
  * vectorized class implements the getDecription which indicates the particular expression kind
  * and data type specialization that class is designed for.  The Description is used by the
  * VectorizationContext class in matching the right vectorized class.
- *
+ * <p>
  * The constructor parameters need to be in the same order as the generic function because
  * the VectorizationContext class automates parameter generation and object construction.
- *
+ * <p>
  * Type information is remembered for the input parameters and the output type.
- *
+ * <p>
  * A vector expression has optional children vector expressions when 1 or more parameters need
  * to be calculated into vector scratch columns.  Columns and constants do not need children
  * expressions.
- *
+ * <p>
  * HOW TO to extend VectorExpression (some basic steps and hints):
  * 1. Create a subclass, and write a proper getDescriptor() (column/scalar?, number for args?, etc.)
  * 2. Define an explicit parameterless constructor
@@ -78,7 +80,7 @@ public abstract class VectorExpression implements Serializable {
 
   /**
    * Child expressions for parameters -- but only those that need to be computed.
-   *
+   * <p>
    * NOTE: Columns and constants are not included in the children.  That is: column numbers and
    * scalar values are passed via the constructor and remembered by the individual vector expression
    * classes. They are not represented in the children.
@@ -88,7 +90,7 @@ public abstract class VectorExpression implements Serializable {
   /**
    * ALL input parameter type information is here including those for (non-computed) columns and
    * scalar values.
-   *
+   * <p>
    * The vectorExpressionParameters() method is used to get the displayable string for the
    * parameters used by EXPLAIN, logging, etc.
    */
@@ -106,7 +108,7 @@ public abstract class VectorExpression implements Serializable {
   /**
    * Input column numbers of the vector expression, which should be reused by vector expressions.
    */
-  public int inputColumnNum[] = {-1, -1, -1};
+  public int[] inputColumnNum = {-1, -1, -1};
 
   /*
    * Use this constructor when there is NO output column.
@@ -125,9 +127,6 @@ public abstract class VectorExpression implements Serializable {
 
   /**
    * Constructor for 1 input column and 1 output column.
-   *
-   * @param inputColumnNum
-   * @param outputColumnNum
    */
   public VectorExpression(int inputColumnNum, int outputColumnNum) {
     this(inputColumnNum, -1, -1, outputColumnNum);
@@ -135,10 +134,6 @@ public abstract class VectorExpression implements Serializable {
 
   /**
    * Constructor for 2 input columns and 1 output column.
-   *
-   * @param inputColumnNum
-   * @param inputColumnNum2
-   * @param outputColumnNum
    */
   public VectorExpression(int inputColumnNum, int inputColumnNum2, int outputColumnNum) {
     this(inputColumnNum, inputColumnNum2, -1, outputColumnNum);
@@ -148,11 +143,6 @@ public abstract class VectorExpression implements Serializable {
    * Constructor for 3 input columns and 1 output column. Currently, VectorExpression is initialized
    * for a maximum of 3 input columns. In case an expression with more than 3 columns wants to reuse
    * logic here, inputColumnNum* fields should be extended.
-   *
-   * @param inputColumnNum
-   * @param inputColumnNum2
-   * @param inputColumnNum3
-   * @param outputColumnNum
    */
   public VectorExpression(int inputColumnNum, int inputColumnNum2, int inputColumnNum3, int outputColumnNum) {
     // By default, no children or inputs.
@@ -172,10 +162,8 @@ public abstract class VectorExpression implements Serializable {
 
   /**
    * Convenience method for expressions that uses arbitrary number of input columns in an array.
-   * @param inputColumnNum
-   * @param outputColumnNum
    */
-  public VectorExpression(int inputColumnNum[], int outputColumnNum) {
+  public VectorExpression(int[] inputColumnNum, int outputColumnNum) {
     // By default, no children or inputs.
     childExpressions = null;
     inputTypeInfos = null;
@@ -200,6 +188,14 @@ public abstract class VectorExpression implements Serializable {
     return childExpressions;
   }
 
+  protected Collection<VectorExpression> getChildExpressionsForTransientInit() {
+    if (getChildExpressions() != null) {
+      return Arrays.asList(getChildExpressions());
+    } else {
+      return Collections.emptyList();
+    }
+  }
+
   //------------------------------------------------------------------------------------------------
 
   public void setInputTypeInfos(TypeInfo ...inputTypeInfos) {
@@ -251,17 +247,12 @@ public abstract class VectorExpression implements Serializable {
 
     // Well, don't recurse but make sure all children are initialized.
     vecExpr.transientInit(conf);
-    List<VectorExpression> newChildren = new ArrayList<VectorExpression>();
-    VectorExpression[] children = vecExpr.getChildExpressions();
-    if (children != null) {
-      Collections.addAll(newChildren, children);
-    }
+
+    Deque<VectorExpression> newChildren = new ArrayDeque<>(vecExpr.getChildExpressionsForTransientInit());
+
     while (!newChildren.isEmpty()) {
-      VectorExpression childVecExpr = newChildren.remove(0);
-      children = childVecExpr.getChildExpressions();
-      if (children != null) {
-        Collections.addAll(newChildren, children);
-      }
+      VectorExpression childVecExpr = newChildren.removeFirst();
+      newChildren.addAll(childVecExpr.getChildExpressionsForTransientInit());
       childVecExpr.transientInit(conf);
     }
   }
@@ -309,8 +300,6 @@ public abstract class VectorExpression implements Serializable {
   }
   /**
    * This is the primary method to implement expression logic.
-   * @param batch
-   * @throws HiveException 
    */
   public abstract void evaluate(VectorizedRowBatch batch) throws HiveException;
 
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java
index 24852e1b728..fcfb9c7cb04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java
@@ -184,8 +184,13 @@ public class GenericUDFIn extends GenericUDF {
         break;
       }
       case STRUCT: {
-        if (constantInSet.contains(((StructObjectInspector) compareOI).getStructFieldsDataAsList(conversionHelper
-           .convertIfNecessary(arguments[0].get(), argumentOIs[0])))) {
+        Object value;
+        if (argumentOIs[0] instanceof ConstantObjectInspector) {
+          value = ((ConstantObjectInspector) argumentOIs[0]).getWritableConstantValue();
+        } else {
+          value = conversionHelper.convertIfNecessary(arguments[0].get(), argumentOIs[0]);
+        }
+        if (constantInSet.contains(((StructObjectInspector) compareOI).getStructFieldsDataAsList(value))) {
           bw.set(true);
           return bw;
         }
diff --git a/ql/src/test/queries/clientpositive/vector_struct_in2.q b/ql/src/test/queries/clientpositive/vector_struct_in2.q
new file mode 100644
index 00000000000..8c5729240cd
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_struct_in2.q
@@ -0,0 +1,26 @@
+set hive.fetch.task.conversion=none;
+
+create table test (a string) partitioned by (y string, m string);
+insert into test values ('aa', 2022, 9);
+
+--=== original bug report, complex query ===============================================================================
+select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) );
+
+
+--=== simple test cases for the distinct causes of the failure of the complex query ====================================
+
+--this is needed not to optimize away the problematic parts of the queries
+set hive.cbo.enable=false;
+
+--embedded expression in struct - used to yield empty result
+select * from test where (struct(cast(y as int)) IN (struct(2022)));
+
+--first argument of in expression is const struct - used to yield empty result
+select * from test where (struct(2022) IN (struct(2022)));
+
+--these are needed not to optimize away the problematic part of the query
+set hive.optimize.constant.propagation=false;
+set hive.optimize.ppd=false;
+
+--first argument of in expression is const primitive - used to cause error
+select * from test where (2022 IN (2022));
diff --git a/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out b/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out
new file mode 100644
index 00000000000..a35c426d68a
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out
@@ -0,0 +1,62 @@
+PREHOOK: query: create table test (a string) partitioned by (y string, m string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: create table test (a string) partitioned by (y string, m string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: insert into test values ('aa', 2022, 9)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: insert into test values ('aa', 2022, 9)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Output: default@test@y=2022/m=9
+POSTHOOK: Lineage: test PARTITION(y=2022,m=9).a SCRIPT []
+PREHOOK: query: select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) )
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) )
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa	2022	9
+PREHOOK: query: select * from test where (struct(cast(y as int)) IN (struct(2022)))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (struct(cast(y as int)) IN (struct(2022)))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa	2022	9
+PREHOOK: query: select * from test where (struct(2022) IN (struct(2022)))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (struct(2022) IN (struct(2022)))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa	2022	9
+PREHOOK: query: select * from test where (2022 IN (2022))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where (2022 IN (2022))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@y=2022/m=9
+#### A masked pattern was here ####
+aa	2022	9