You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/10/29 17:11:29 UTC
svn commit: r1536787 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/exec/vector/
java/org/apache/hadoop/hive/ql/optimizer/physical/
test/org/apache/hadoop/hive/ql/optimizer/physical/
Author: hashutosh
Date: Tue Oct 29 16:11:28 2013
New Revision: 1536787
URL: http://svn.apache.org/r1536787
Log:
HIVE-5604 : Fix validation of nested expressions. (Jitendra Nath Pandey via Ashutosh Chauhan)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java Tue Oct 29 16:11:28 2013
@@ -21,10 +21,14 @@ package org.apache.hadoop.hive.ql.exec.v
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.metadata.HiveException;
-import java.util.Arrays;
-
+/**
+ * Describes a vector expression and encapsulates the {@link Mode}, number of arguments,
+ * argument types {@link ArgumentType} and expression types {@link InputExpressionType}.
+ */
public class VectorExpressionDescriptor {
+ final static int MAX_NUM_ARGUMENTS = 3;
+
public enum ArgumentType {
NONE(0),
LONG(1),
@@ -79,39 +83,20 @@ public class VectorExpressionDescriptor
}
/**
- * Each vector expression has a bitmap that determines the kind or a classification for
- * the expression. Following parameters are used to identify the kind of an expression.
- * <ol>
- * <li>The expression produces an output column (projection) or does in-place filtering
- * (filter).</li>
- * <li>Number if arguments the expression takes (unary, binary etc). For now we assume maximum 3
- * arguments.</li>
- * <li>Types of each argument (long/double/string)</li>
- * <li>The input to the expression is a column or a scalar.</li>
- * </ol>
- * The bitmap consists of 18 bits:
- * <ul>
- * <li>1 bit for filter/projection.
- * <li>2 bits for number of input arguments.
- * <li>3 bits for each argument type. Total 9 bits for maximum 3 arguments. For unary
- * expressions only first 3 bits are set, rest of the 6 bits are set to 0.
- * <li>2 bits to encode whether argument is a column or scalar. Total 6 bits for each argument.
- * <ul>
+ * Builder builds a {@link Descriptor} object. Setter methods are provided to set the {@link Mode}, number
+ * of arguments, argument types and expression types for each argument.
*/
public static class Builder {
private Mode mode = Mode.PROJECTION;
- private final int maxNumArguments = 3;
- ArgumentType [] argTypes = new ArgumentType[maxNumArguments];
- InputExpressionType [] exprTypes = new InputExpressionType[maxNumArguments];
+ ArgumentType [] argTypes = new ArgumentType[MAX_NUM_ARGUMENTS];
+ InputExpressionType [] exprTypes = new InputExpressionType[MAX_NUM_ARGUMENTS];
private int argCount = 0;
public Builder() {
- argTypes[0] = ArgumentType.NONE;
- argTypes[1] = ArgumentType.NONE;
- argTypes[2] = ArgumentType.NONE;
- exprTypes[0] = InputExpressionType.NONE;
- exprTypes[1] = InputExpressionType.NONE;
- exprTypes[2] = InputExpressionType.NONE;
+ for (int i = 0 ; i < MAX_NUM_ARGUMENTS; i++) {
+ argTypes[i] = ArgumentType.NONE;
+ exprTypes[i] = InputExpressionType.NONE;
+ }
}
public Builder setMode(Mode m) {
@@ -166,7 +151,8 @@ public class VectorExpressionDescriptor
}
/**
- * Descriptor is immutable and is constructed by the {@link Builder} only.
+ * Descriptor is immutable and is constructed by the {@link Builder} only. {@link #equals(Object)} is the only
+ * publicly exposed member which can be used to compare two descriptors.
*/
public static final class Descriptor {
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java Tue Oct 29 16:11:28 2013
@@ -124,14 +124,14 @@ public class VectorizationContext {
this.fileKey = fileKey;
}
- private int getInputColumnIndex(String name) {
- if (!columnMap.containsKey(name)) {
- LOG.error(String.format("The column %s is not in the vectorization context column map.", name));
- }
- return columnMap.get(name);
+ protected int getInputColumnIndex(String name) {
+ if (!columnMap.containsKey(name)) {
+ LOG.error(String.format("The column %s is not in the vectorization context column map.", name));
+ }
+ return columnMap.get(name);
}
- private int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
+ protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
return columnMap.get(colExpr.getColumn());
}
@@ -139,7 +139,7 @@ public class VectorizationContext {
private final int initialOutputCol;
private int outputColCount = 0;
- OutputColumnManager(int initialOutputCol) {
+ protected OutputColumnManager(int initialOutputCol) {
this.initialOutputCol = initialOutputCol;
}
@@ -152,6 +152,10 @@ public class VectorizationContext {
private final Set<Integer> usedOutputColumns = new HashSet<Integer>();
int allocateOutputColumn(String columnType) {
+ if (initialOutputCol < 0) {
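+ // A negative initialOutputCol marks a validation-only context (see Vectorizer's ValidatorVectorizationContext); skip allocation and return a dummy index.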
+ return 0;
+ }
int relativeCol = allocateOutputColumnInternal(columnType);
return initialOutputCol + relativeCol;
}
@@ -183,6 +187,10 @@ public class VectorizationContext {
}
void freeOutputColumn(int index) {
+ if (initialOutputCol < 0) {
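+ // A negative initialOutputCol marks a validation-only context; allocateOutputColumn allocated nothing, so there is nothing to free.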
+ return;
+ }
int colIndex = index-initialOutputCol;
if (colIndex >= 0) {
usedOutputColumns.remove(index-initialOutputCol);
@@ -423,6 +431,9 @@ public class VectorizationContext {
private VectorExpression getVectorExpressionForUdf(Class<?> udf, List<ExprNodeDesc> childExpr, Mode mode)
throws HiveException {
int numChildren = (childExpr == null) ? 0 : childExpr.size();
+ if (numChildren > VectorExpressionDescriptor.MAX_NUM_ARGUMENTS) {
+ return null;
+ }
VectorExpressionDescriptor.Builder builder = new VectorExpressionDescriptor.Builder();
builder.setNumArguments(numChildren);
builder.setMode(mode);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Tue Oct 29 16:11:28 2013
@@ -46,6 +46,7 @@ import org.apache.hadoop.hive.ql.exec.Ta
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
@@ -63,66 +64,9 @@ import org.apache.hadoop.hive.ql.metadat
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.RowResolver;
import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
-import org.apache.hadoop.hive.ql.plan.AggregationDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
-import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
-import org.apache.hadoop.hive.ql.plan.MapWork;
-import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.*;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
-import org.apache.hadoop.hive.ql.udf.UDFAcos;
-import org.apache.hadoop.hive.ql.udf.UDFAsin;
-import org.apache.hadoop.hive.ql.udf.UDFAtan;
-import org.apache.hadoop.hive.ql.udf.UDFBin;
-import org.apache.hadoop.hive.ql.udf.UDFCeil;
-import org.apache.hadoop.hive.ql.udf.UDFConv;
-import org.apache.hadoop.hive.ql.udf.UDFCos;
-import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth;
-import org.apache.hadoop.hive.ql.udf.UDFDegrees;
-import org.apache.hadoop.hive.ql.udf.UDFExp;
-import org.apache.hadoop.hive.ql.udf.UDFFloor;
-import org.apache.hadoop.hive.ql.udf.UDFHex;
-import org.apache.hadoop.hive.ql.udf.UDFHour;
-import org.apache.hadoop.hive.ql.udf.UDFLTrim;
-import org.apache.hadoop.hive.ql.udf.UDFLength;
-import org.apache.hadoop.hive.ql.udf.UDFLike;
-import org.apache.hadoop.hive.ql.udf.UDFLn;
-import org.apache.hadoop.hive.ql.udf.UDFLog;
-import org.apache.hadoop.hive.ql.udf.UDFLog10;
-import org.apache.hadoop.hive.ql.udf.UDFLog2;
-import org.apache.hadoop.hive.ql.udf.UDFMinute;
-import org.apache.hadoop.hive.ql.udf.UDFOPDivide;
-import org.apache.hadoop.hive.ql.udf.UDFOPMinus;
-import org.apache.hadoop.hive.ql.udf.UDFOPMod;
-import org.apache.hadoop.hive.ql.udf.UDFOPMultiply;
-import org.apache.hadoop.hive.ql.udf.UDFOPNegative;
-import org.apache.hadoop.hive.ql.udf.UDFOPPlus;
-import org.apache.hadoop.hive.ql.udf.UDFOPPositive;
-import org.apache.hadoop.hive.ql.udf.UDFPosMod;
-import org.apache.hadoop.hive.ql.udf.UDFPower;
-import org.apache.hadoop.hive.ql.udf.UDFRTrim;
-import org.apache.hadoop.hive.ql.udf.UDFRadians;
-import org.apache.hadoop.hive.ql.udf.UDFRand;
-import org.apache.hadoop.hive.ql.udf.UDFRound;
-import org.apache.hadoop.hive.ql.udf.UDFSecond;
-import org.apache.hadoop.hive.ql.udf.UDFSign;
-import org.apache.hadoop.hive.ql.udf.UDFSin;
-import org.apache.hadoop.hive.ql.udf.UDFSqrt;
-import org.apache.hadoop.hive.ql.udf.UDFSubstr;
-import org.apache.hadoop.hive.ql.udf.UDFTan;
-import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
-import org.apache.hadoop.hive.ql.udf.UDFToByte;
-import org.apache.hadoop.hive.ql.udf.UDFToDouble;
-import org.apache.hadoop.hive.ql.udf.UDFToFloat;
-import org.apache.hadoop.hive.ql.udf.UDFToInteger;
-import org.apache.hadoop.hive.ql.udf.UDFToLong;
-import org.apache.hadoop.hive.ql.udf.UDFToShort;
-import org.apache.hadoop.hive.ql.udf.UDFToString;
-import org.apache.hadoop.hive.ql.udf.UDFTrim;
-import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
-import org.apache.hadoop.hive.ql.udf.UDFYear;
+import org.apache.hadoop.hive.ql.udf.*;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
@@ -201,6 +145,7 @@ public class Vectorizer implements Physi
supportedGenericUDFs.add(UDFDayOfMonth.class);
supportedGenericUDFs.add(UDFLike.class);
+ supportedGenericUDFs.add(UDFRegExp.class);
supportedGenericUDFs.add(UDFSubstr.class);
supportedGenericUDFs.add(UDFLTrim.class);
supportedGenericUDFs.add(UDFRTrim.class);
@@ -352,7 +297,7 @@ public class Vectorizer implements Physi
}
boolean ret = validateOperator(op);
if (!ret) {
- LOG.info("Operator: "+op.getName()+" could not be vectorized.");
+ LOG.info("Operator: " + op.getName() + " could not be vectorized.");
return new Boolean(false);
}
}
@@ -451,6 +396,22 @@ public class Vectorizer implements Physi
}
}
+ private static class ValidatorVectorizationContext extends VectorizationContext {
+ private ValidatorVectorizationContext() {
+ super(null, -1);
+ }
+
+ @Override
+ protected int getInputColumnIndex(String name) {
+ return 0;
+ }
+
+ @Override
+ protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
+ return 0;
+ }
+ }
+
@Override
public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
this.physicalContext = pctx;
@@ -509,7 +470,7 @@ public class Vectorizer implements Physi
List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable);
List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable);
List<ExprNodeDesc> valueExprs = desc.getExprs().get(posBigTable);
- return validateExprNodeDesc(filterExprs) &&
+ return validateExprNodeDesc(filterExprs, VectorExpressionDescriptor.Mode.FILTER) &&
validateExprNodeDesc(keyExprs) &&
validateExprNodeDesc(valueExprs);
}
@@ -535,7 +496,7 @@ public class Vectorizer implements Physi
private boolean validateFilterOperator(FilterOperator op) {
ExprNodeDesc desc = op.getConf().getPredicate();
- return validateExprNodeDesc(desc);
+ return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.FILTER);
}
private boolean validateGroupByOperator(GroupByOperator op) {
@@ -547,8 +508,12 @@ public class Vectorizer implements Physi
}
private boolean validateExprNodeDesc(List<ExprNodeDesc> descs) {
+ return validateExprNodeDesc(descs, VectorExpressionDescriptor.Mode.PROJECTION);
+ }
+
+ private boolean validateExprNodeDesc(List<ExprNodeDesc> descs, VectorExpressionDescriptor.Mode mode) {
for (ExprNodeDesc d : descs) {
- boolean ret = validateExprNodeDesc(d);
+ boolean ret = validateExprNodeDesc(d, mode);
if (!ret) {
return false;
}
@@ -566,7 +531,7 @@ public class Vectorizer implements Physi
return true;
}
- private boolean validateExprNodeDesc(ExprNodeDesc desc) {
+ private boolean validateExprNodeDescRecursive(ExprNodeDesc desc) {
boolean ret = validateDataType(desc.getTypeInfo().getTypeName());
if (!ret) {
return false;
@@ -580,12 +545,37 @@ public class Vectorizer implements Physi
}
if (desc.getChildren() != null) {
for (ExprNodeDesc d: desc.getChildren()) {
- validateExprNodeDesc(d);
+ boolean r = validateExprNodeDescRecursive(d);
+ if (!r) {
+ return false;
+ }
}
}
return true;
}
+ private boolean validateExprNodeDesc(ExprNodeDesc desc) {
+ return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.PROJECTION);
+ }
+
+ boolean validateExprNodeDesc(ExprNodeDesc desc, VectorExpressionDescriptor.Mode mode) {
+ if (!validateExprNodeDescRecursive(desc)) {
+ return false;
+ }
+ try {
+ VectorizationContext vc = new ValidatorVectorizationContext();
+ if (vc.getVectorExpression(desc, mode) == null) {
+ return false;
+ }
+ } catch (HiveException e) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Failed to vectorize", e);
+ }
+ return false;
+ }
+ return true;
+ }
+
private boolean validateGenericUdf(ExprNodeGenericFuncDesc genericUDFExpr) {
if (VectorizationContext.isCustomUDF(genericUDFExpr)) {
return true;
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java?rev=1536787&r1=1536786&r2=1536787&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java Tue Oct 29 16:11:28 2013
@@ -25,18 +25,18 @@ import java.util.Map;
import junit.framework.Assert;
+import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
import org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFSumLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FuncAbsLongToLong;
import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.plan.AggregationDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
-import org.apache.hadoop.hive.ql.plan.GroupByDesc;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs;
+import org.apache.hadoop.hive.ql.plan.*;
+import org.apache.hadoop.hive.ql.udf.generic.*;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.junit.Before;
import org.junit.Test;
@@ -50,9 +50,28 @@ public class TestVectorizer {
Map<String, Integer> columnMap = new HashMap<String, Integer>();
columnMap.put("col1", 0);
columnMap.put("col2", 1);
+ columnMap.put("col3", 2);
//Generate vectorized expression
- vContext = new VectorizationContext(columnMap, 2);
+ vContext = new VectorizationContext(columnMap, 3);
+ }
+
+ @Description(name = "fake", value = "FAKE")
+ static class FakeGenericUDF extends GenericUDF {
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+ return null;
+ }
+
+ @Override
+ public Object evaluate(DeferredObject[] arguments) throws HiveException {
+ return null;
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ return "fake";
+ }
}
@Test
@@ -96,4 +115,37 @@ public class TestVectorizer {
VectorUDAFSumLong udaf = (VectorUDAFSumLong) vectorOp.getAggregators()[0];
Assert.assertEquals(FuncAbsLongToLong.class, udaf.getInputExpression().getClass());
}
+
+ @Test
+ public void testValidateNestedExpressions() {
+ ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(Integer.class, "col1", "table", false);
+ ExprNodeConstantDesc constDesc = new ExprNodeConstantDesc(new Integer(10));
+
+ GenericUDFOPGreaterThan udf = new GenericUDFOPGreaterThan();
+ ExprNodeGenericFuncDesc greaterExprDesc = new ExprNodeGenericFuncDesc();
+ greaterExprDesc.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
+ greaterExprDesc.setGenericUDF(udf);
+ List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>(2);
+ children1.add(col1Expr);
+ children1.add(constDesc);
+ greaterExprDesc.setChildren(children1);
+
+ FakeGenericUDF udf2 = new FakeGenericUDF();
+ ExprNodeGenericFuncDesc nonSupportedExpr = new ExprNodeGenericFuncDesc();
+ nonSupportedExpr.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
+ nonSupportedExpr.setGenericUDF(udf2);
+
+ GenericUDFOPAnd andUdf = new GenericUDFOPAnd();
+ ExprNodeGenericFuncDesc andExprDesc = new ExprNodeGenericFuncDesc();
+ andExprDesc.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
+ andExprDesc.setGenericUDF(andUdf);
+ List<ExprNodeDesc> children3 = new ArrayList<ExprNodeDesc>(2);
+ children3.add(greaterExprDesc);
+ children3.add(nonSupportedExpr);
+ andExprDesc.setChildren(children3);
+
+ Vectorizer v = new Vectorizer();
+ Assert.assertFalse(v.validateExprNodeDesc(andExprDesc, VectorExpressionDescriptor.Mode.FILTER));
+ Assert.assertFalse(v.validateExprNodeDesc(andExprDesc, VectorExpressionDescriptor.Mode.PROJECTION));
+ }
}