You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/10/29 17:11:29 UTC

svn commit: r1536787 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/ java/org/apache/hadoop/hive/ql/optimizer/physical/ test/org/apache/hadoop/hive/ql/optimizer/physical/

Author: hashutosh
Date: Tue Oct 29 16:11:28 2013
New Revision: 1536787

URL: http://svn.apache.org/r1536787
Log:
HIVE-5604 : Fix validation of nested expressions. (Jitendra Nath Pandey via Ashutosh Chauhan)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java Tue Oct 29 16:11:28 2013
@@ -21,10 +21,14 @@ package org.apache.hadoop.hive.ql.exec.v
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 
-import java.util.Arrays;
-
+/**
+ * Describes a vector expression and encapsulates the {@link Mode}, number of arguments,
+ * argument types {@link ArgumentType} and expression types {@link InputExpressionType}.
+ */
 public class VectorExpressionDescriptor {
 
+  final static int MAX_NUM_ARGUMENTS = 3;
+
   public enum ArgumentType {
     NONE(0),
     LONG(1),
@@ -79,39 +83,20 @@ public class VectorExpressionDescriptor 
   }
 
   /**
-   * Each vector expression has a bitmap that determines the kind or a classification for
-   * the expression. Following parameters are used to identify the kind of an expression.
-   * <ol>
-   * <li>The expression produces an output column (projection) or does in-place filtering
-   *     (filter).</li>
-   * <li>Number if arguments the expression takes (unary, binary etc). For now we assume maximum 3
-   *     arguments.</li>
-   * <li>Types of each argument (long/double/string)</li>
-   * <li>The input to the expression is a column or a scalar.</li>
-   * </ol>
-   * The bitmap consists of 18 bits:
-   *   <ul>
-   *   <li>1 bit for filter/projection.
-   *   <li>2 bits for number of input arguments.
-   *   <li>3 bits for each argument type. Total 9 bits for maximum 3 arguments. For unary
-   *       expressions only first 3 bits are set, rest of the 6 bits are set to 0.
-   *   <li>2 bits to encode whether argument is a column or scalar. Total 6 bits for each argument.
-   *   <ul>
+   * Builder builds a {@link Descriptor} object. Setter methods are provided to set the {@link Mode}, number
+   * of arguments, argument types and expression types for each argument.
    */
   public static class Builder {
     private Mode mode = Mode.PROJECTION;
-    private final int maxNumArguments = 3;
-    ArgumentType [] argTypes = new ArgumentType[maxNumArguments];
-    InputExpressionType [] exprTypes = new InputExpressionType[maxNumArguments];
+    ArgumentType [] argTypes = new ArgumentType[MAX_NUM_ARGUMENTS];
+    InputExpressionType [] exprTypes = new InputExpressionType[MAX_NUM_ARGUMENTS];
     private int argCount = 0;
 
     public Builder() {
-      argTypes[0] = ArgumentType.NONE;
-      argTypes[1] = ArgumentType.NONE;
-      argTypes[2] = ArgumentType.NONE;
-      exprTypes[0] = InputExpressionType.NONE;
-      exprTypes[1] = InputExpressionType.NONE;
-      exprTypes[2] = InputExpressionType.NONE;
+      for (int i = 0 ; i < MAX_NUM_ARGUMENTS; i++) {
+        argTypes[i] = ArgumentType.NONE;
+        exprTypes[i] = InputExpressionType.NONE;
+      }
     }
 
     public Builder setMode(Mode m) {
@@ -166,7 +151,8 @@ public class VectorExpressionDescriptor 
   }
 
   /**
-   * Descriptor is immutable and is constructed by the {@link Builder} only.
+   * Descriptor is immutable and is constructed by the {@link Builder} only. {@link #equals(Object)} is the only
+   * publicly exposed member which can be used to compare two descriptors.
    */
   public static final class Descriptor {
 

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java Tue Oct 29 16:11:28 2013
@@ -124,14 +124,14 @@ public class VectorizationContext {
     this.fileKey = fileKey;
   }
 
-  private int getInputColumnIndex(String name) {
-      if (!columnMap.containsKey(name)) {
-        LOG.error(String.format("The column %s is not in the vectorization context column map.", name));
-      }
-      return columnMap.get(name);
+  protected int getInputColumnIndex(String name) {
+    if (!columnMap.containsKey(name)) {
+      LOG.error(String.format("The column %s is not in the vectorization context column map.", name));
+    }
+    return columnMap.get(name);
   }
 
-  private int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
+  protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
     return columnMap.get(colExpr.getColumn());
   }
 
@@ -139,7 +139,7 @@ public class VectorizationContext {
     private final int initialOutputCol;
     private int outputColCount = 0;
 
-    OutputColumnManager(int initialOutputCol) {
+    protected OutputColumnManager(int initialOutputCol) {
       this.initialOutputCol = initialOutputCol;
     }
 
@@ -152,6 +152,10 @@ public class VectorizationContext {
     private final Set<Integer> usedOutputColumns = new HashSet<Integer>();
 
     int allocateOutputColumn(String columnType) {
+      if (initialOutputCol < 0) {
+        // Negative initialOutputCol: expressions are only being validated, so allocate nothing.
+        return 0;
+      }
       int relativeCol = allocateOutputColumnInternal(columnType);
       return initialOutputCol + relativeCol;
     }
@@ -183,6 +187,10 @@ public class VectorizationContext {
     }
 
     void freeOutputColumn(int index) {
+      if (initialOutputCol < 0) {
+        // Negative initialOutputCol: expressions are only being validated, so nothing to free.
+        return;
+      }
       int colIndex = index-initialOutputCol;
       if (colIndex >= 0) {
         usedOutputColumns.remove(index-initialOutputCol);
@@ -423,6 +431,9 @@ public class VectorizationContext {
   private VectorExpression getVectorExpressionForUdf(Class<?> udf, List<ExprNodeDesc> childExpr, Mode mode)
       throws HiveException {
     int numChildren = (childExpr == null) ? 0 : childExpr.size();
+    if (numChildren > VectorExpressionDescriptor.MAX_NUM_ARGUMENTS) {
+      return null;
+    }
     VectorExpressionDescriptor.Builder builder = new VectorExpressionDescriptor.Builder();
     builder.setNumArguments(numChildren);
     builder.setMode(mode);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Tue Oct 29 16:11:28 2013
@@ -46,6 +46,7 @@ import org.apache.hadoop.hive.ql.exec.Ta
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
@@ -63,66 +64,9 @@ import org.apache.hadoop.hive.ql.metadat
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.parse.RowResolver;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
-import org.apache.hadoop.hive.ql.plan.AggregationDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
-import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
-import org.apache.hadoop.hive.ql.plan.MapWork;
-import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.*;
 import org.apache.hadoop.hive.ql.plan.api.OperatorType;
-import org.apache.hadoop.hive.ql.udf.UDFAcos;
-import org.apache.hadoop.hive.ql.udf.UDFAsin;
-import org.apache.hadoop.hive.ql.udf.UDFAtan;
-import org.apache.hadoop.hive.ql.udf.UDFBin;
-import org.apache.hadoop.hive.ql.udf.UDFCeil;
-import org.apache.hadoop.hive.ql.udf.UDFConv;
-import org.apache.hadoop.hive.ql.udf.UDFCos;
-import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth;
-import org.apache.hadoop.hive.ql.udf.UDFDegrees;
-import org.apache.hadoop.hive.ql.udf.UDFExp;
-import org.apache.hadoop.hive.ql.udf.UDFFloor;
-import org.apache.hadoop.hive.ql.udf.UDFHex;
-import org.apache.hadoop.hive.ql.udf.UDFHour;
-import org.apache.hadoop.hive.ql.udf.UDFLTrim;
-import org.apache.hadoop.hive.ql.udf.UDFLength;
-import org.apache.hadoop.hive.ql.udf.UDFLike;
-import org.apache.hadoop.hive.ql.udf.UDFLn;
-import org.apache.hadoop.hive.ql.udf.UDFLog;
-import org.apache.hadoop.hive.ql.udf.UDFLog10;
-import org.apache.hadoop.hive.ql.udf.UDFLog2;
-import org.apache.hadoop.hive.ql.udf.UDFMinute;
-import org.apache.hadoop.hive.ql.udf.UDFOPDivide;
-import org.apache.hadoop.hive.ql.udf.UDFOPMinus;
-import org.apache.hadoop.hive.ql.udf.UDFOPMod;
-import org.apache.hadoop.hive.ql.udf.UDFOPMultiply;
-import org.apache.hadoop.hive.ql.udf.UDFOPNegative;
-import org.apache.hadoop.hive.ql.udf.UDFOPPlus;
-import org.apache.hadoop.hive.ql.udf.UDFOPPositive;
-import org.apache.hadoop.hive.ql.udf.UDFPosMod;
-import org.apache.hadoop.hive.ql.udf.UDFPower;
-import org.apache.hadoop.hive.ql.udf.UDFRTrim;
-import org.apache.hadoop.hive.ql.udf.UDFRadians;
-import org.apache.hadoop.hive.ql.udf.UDFRand;
-import org.apache.hadoop.hive.ql.udf.UDFRound;
-import org.apache.hadoop.hive.ql.udf.UDFSecond;
-import org.apache.hadoop.hive.ql.udf.UDFSign;
-import org.apache.hadoop.hive.ql.udf.UDFSin;
-import org.apache.hadoop.hive.ql.udf.UDFSqrt;
-import org.apache.hadoop.hive.ql.udf.UDFSubstr;
-import org.apache.hadoop.hive.ql.udf.UDFTan;
-import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
-import org.apache.hadoop.hive.ql.udf.UDFToByte;
-import org.apache.hadoop.hive.ql.udf.UDFToDouble;
-import org.apache.hadoop.hive.ql.udf.UDFToFloat;
-import org.apache.hadoop.hive.ql.udf.UDFToInteger;
-import org.apache.hadoop.hive.ql.udf.UDFToLong;
-import org.apache.hadoop.hive.ql.udf.UDFToShort;
-import org.apache.hadoop.hive.ql.udf.UDFToString;
-import org.apache.hadoop.hive.ql.udf.UDFTrim;
-import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
-import org.apache.hadoop.hive.ql.udf.UDFYear;
+import org.apache.hadoop.hive.ql.udf.*;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
@@ -201,6 +145,7 @@ public class Vectorizer implements Physi
     supportedGenericUDFs.add(UDFDayOfMonth.class);
 
     supportedGenericUDFs.add(UDFLike.class);
+    supportedGenericUDFs.add(UDFRegExp.class);
     supportedGenericUDFs.add(UDFSubstr.class);
     supportedGenericUDFs.add(UDFLTrim.class);
     supportedGenericUDFs.add(UDFRTrim.class);
@@ -352,7 +297,7 @@ public class Vectorizer implements Physi
         }
         boolean ret = validateOperator(op);
         if (!ret) {
-          LOG.info("Operator: "+op.getName()+" could not be vectorized.");
+          LOG.info("Operator: " + op.getName() + " could not be vectorized.");
           return new Boolean(false);
         }
       }
@@ -451,6 +396,22 @@ public class Vectorizer implements Physi
     }
   }
 
+  private static class ValidatorVectorizationContext extends VectorizationContext {
+    private ValidatorVectorizationContext() {
+      super(null, -1);
+    }
+
+    @Override
+    protected int getInputColumnIndex(String name) {
+      return 0;
+    }
+
+    @Override
+    protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
+      return 0;
+    }
+  }
+
   @Override
   public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
     this.physicalContext  = pctx;
@@ -509,7 +470,7 @@ public class Vectorizer implements Physi
     List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable);
     List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable);
     List<ExprNodeDesc> valueExprs = desc.getExprs().get(posBigTable);
-    return validateExprNodeDesc(filterExprs) &&
+    return validateExprNodeDesc(filterExprs, VectorExpressionDescriptor.Mode.FILTER) &&
         validateExprNodeDesc(keyExprs) &&
         validateExprNodeDesc(valueExprs);
   }
@@ -535,7 +496,7 @@ public class Vectorizer implements Physi
 
   private boolean validateFilterOperator(FilterOperator op) {
     ExprNodeDesc desc = op.getConf().getPredicate();
-    return validateExprNodeDesc(desc);
+    return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.FILTER);
   }
 
   private boolean validateGroupByOperator(GroupByOperator op) {
@@ -547,8 +508,12 @@ public class Vectorizer implements Physi
   }
 
   private boolean validateExprNodeDesc(List<ExprNodeDesc> descs) {
+    return validateExprNodeDesc(descs, VectorExpressionDescriptor.Mode.PROJECTION);
+  }
+
+  private boolean validateExprNodeDesc(List<ExprNodeDesc> descs, VectorExpressionDescriptor.Mode mode) {
     for (ExprNodeDesc d : descs) {
-      boolean ret = validateExprNodeDesc(d);
+      boolean ret = validateExprNodeDesc(d, mode);
       if (!ret) {
         return false;
       }
@@ -566,7 +531,7 @@ public class Vectorizer implements Physi
     return true;
   }
 
-  private boolean validateExprNodeDesc(ExprNodeDesc desc) {
+  private boolean validateExprNodeDescRecursive(ExprNodeDesc desc) {
     boolean ret = validateDataType(desc.getTypeInfo().getTypeName());
     if (!ret) {
       return false;
@@ -580,12 +545,37 @@ public class Vectorizer implements Physi
     }
     if (desc.getChildren() != null) {
       for (ExprNodeDesc d: desc.getChildren()) {
-        validateExprNodeDesc(d);
+        boolean r = validateExprNodeDescRecursive(d);
+        if (!r) {
+          return false;
+        }
       }
     }
     return true;
   }
 
+  private boolean validateExprNodeDesc(ExprNodeDesc desc) {
+    return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.PROJECTION);
+  }
+
+  boolean validateExprNodeDesc(ExprNodeDesc desc, VectorExpressionDescriptor.Mode mode) {
+    if (!validateExprNodeDescRecursive(desc)) {
+      return false;
+    }
+    try {
+      VectorizationContext vc = new ValidatorVectorizationContext();
+      if (vc.getVectorExpression(desc, mode) == null) {
+        return false;
+      }
+    } catch (HiveException e) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Failed to vectorize", e);
+      }
+      return false;
+    }
+    return true;
+  }
+
   private boolean validateGenericUdf(ExprNodeGenericFuncDesc genericUDFExpr) {
     if (VectorizationContext.isCustomUDF(genericUDFExpr)) {
       return true;

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java Tue Oct 29 16:11:28 2013
@@ -25,18 +25,18 @@ import java.util.Map;
 
 import junit.framework.Assert;
 
+import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
 import org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFSumLong;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncAbsLongToLong;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.plan.AggregationDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
-import org.apache.hadoop.hive.ql.plan.GroupByDesc;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
+import org.apache.hadoop.hive.ql.plan.*;
+import org.apache.hadoop.hive.ql.udf.generic.*;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 import org.junit.Before;
 import org.junit.Test;
@@ -50,9 +50,28 @@ public class TestVectorizer {
     Map<String, Integer> columnMap = new HashMap<String, Integer>();
     columnMap.put("col1", 0);
     columnMap.put("col2", 1);
+    columnMap.put("col3", 2);
 
     //Generate vectorized expression
-    vContext = new VectorizationContext(columnMap, 2);
+    vContext = new VectorizationContext(columnMap, 3);
+  }
+
+  @Description(name = "fake", value = "FAKE")
+  static class FakeGenericUDF extends GenericUDF {
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+      return null;
+    }
+
+    @Override
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
+      return null;
+    }
+
+    @Override
+    public String getDisplayString(String[] children) {
+      return "fake";
+    }
   }
 
   @Test
@@ -96,4 +115,37 @@ public class TestVectorizer {
     VectorUDAFSumLong udaf = (VectorUDAFSumLong) vectorOp.getAggregators()[0];
     Assert.assertEquals(FuncAbsLongToLong.class, udaf.getInputExpression().getClass());
   }
+
+  @Test
+  public void testValidateNestedExpressions() {
+    ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(Integer.class, "col1", "table", false);
+    ExprNodeConstantDesc constDesc = new ExprNodeConstantDesc(new Integer(10));
+
+    GenericUDFOPGreaterThan udf = new GenericUDFOPGreaterThan();
+    ExprNodeGenericFuncDesc greaterExprDesc = new ExprNodeGenericFuncDesc();
+    greaterExprDesc.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
+    greaterExprDesc.setGenericUDF(udf);
+    List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>(2);
+    children1.add(col1Expr);
+    children1.add(constDesc);
+    greaterExprDesc.setChildren(children1);
+
+    FakeGenericUDF udf2 = new FakeGenericUDF();
+    ExprNodeGenericFuncDesc nonSupportedExpr = new ExprNodeGenericFuncDesc();
+    nonSupportedExpr.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
+    nonSupportedExpr.setGenericUDF(udf2);
+
+    GenericUDFOPAnd andUdf = new GenericUDFOPAnd();
+    ExprNodeGenericFuncDesc andExprDesc = new ExprNodeGenericFuncDesc();
+    andExprDesc.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
+    andExprDesc.setGenericUDF(andUdf);
+    List<ExprNodeDesc> children3 = new ArrayList<ExprNodeDesc>(2);
+    children3.add(greaterExprDesc);
+    children3.add(nonSupportedExpr);
+    andExprDesc.setChildren(children3);
+
+    Vectorizer v = new Vectorizer();
+    Assert.assertFalse(v.validateExprNodeDesc(andExprDesc, VectorExpressionDescriptor.Mode.FILTER));
+    Assert.assertFalse(v.validateExprNodeDesc(andExprDesc, VectorExpressionDescriptor.Mode.PROJECTION));
+  }
 }