You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by tc...@apache.org on 2019/01/22 06:14:50 UTC
hive git commit: HIVE-20419: Vectorization: Prevent mutation of
VectorPartitionDesc after being used in a hashmap key (Teddy Choi,
reviewed by Gopal V)
Repository: hive
Updated Branches:
refs/heads/master 34c8ca432 -> cb74a685c
HIVE-20419: Vectorization: Prevent mutation of VectorPartitionDesc after being used in a hashmap key (Teddy Choi, reviewed by Gopal V)
Change-Id: Ie9ae156c6b25f39dfdab1742b0c35219c8275062
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/cb74a685
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/cb74a685
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/cb74a685
Branch: refs/heads/master
Commit: cb74a685ce2b09f8deaefa9805361ef96c26eccb
Parents: 34c8ca4
Author: Teddy Choi <tc...@hortonworks.com>
Authored: Tue Jan 22 15:14:26 2019 +0900
Committer: Teddy Choi <tc...@hortonworks.com>
Committed: Tue Jan 22 15:14:26 2019 +0900
----------------------------------------------------------------------
.../hive/ql/optimizer/physical/Vectorizer.java | 130 +++++++++++--------
.../hive/ql/plan/VectorPartitionDesc.java | 34 ++---
.../hive/ql/io/orc/TestInputOutputFormat.java | 2 +-
3 files changed, 94 insertions(+), 72 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/cb74a685/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 0a1a25f..5023f2f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -249,6 +249,8 @@ public class Vectorizer implements PhysicalPlanResolver {
private static final Pattern supportedDataTypesPattern;
+ private static final TypeInfo[] EMPTY_TYPEINFO_ARRAY = new TypeInfo[0];
+
static {
StringBuilder patternBuilder = new StringBuilder();
patternBuilder.append("int");
@@ -1372,10 +1374,16 @@ public class Vectorizer implements PhysicalPlanResolver {
Set<String> inputFileFormatClassNameSet,
Map<VectorPartitionDesc, VectorPartitionDesc> vectorPartitionDescMap,
Set<String> enabledConditionsMetSet, ArrayList<String> enabledConditionsNotMetList,
- Set<Support> newSupportSet) {
+ Set<Support> newSupportSet, List<TypeInfo> dataTypeInfoList) {
Class<? extends InputFormat> inputFileFormatClass = pd.getInputFileFormatClass();
String inputFileFormatClassName = inputFileFormatClass.getName();
+ final TypeInfo[] dataTypeInfos;
+ if (dataTypeInfoList == null) {
+ dataTypeInfos = EMPTY_TYPEINFO_ARRAY;
+ } else {
+ dataTypeInfos = dataTypeInfoList.toArray(new TypeInfo[dataTypeInfoList.size()]);
+ }
// Always collect input file formats.
inputFileFormatClassNameSet.add(inputFileFormatClassName);
@@ -1401,7 +1409,9 @@ public class Vectorizer implements PhysicalPlanResolver {
addVectorPartitionDesc(
pd,
VectorPartitionDesc.createVectorizedInputFileFormat(
- inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)),
+ inputFileFormatClassName,
+ Utilities.isInputFileFormatSelfDescribing(pd),
+ dataTypeInfos),
vectorPartitionDescMap);
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
@@ -1427,7 +1437,9 @@ public class Vectorizer implements PhysicalPlanResolver {
addVectorPartitionDesc(
pd,
VectorPartitionDesc.createVectorizedInputFileFormat(
- inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)),
+ inputFileFormatClassName,
+ Utilities.isInputFileFormatSelfDescribing(pd),
+ dataTypeInfos),
vectorPartitionDescMap);
enabledConditionsMetSet.add(
@@ -1495,7 +1507,7 @@ public class Vectorizer implements PhysicalPlanResolver {
addVectorPartitionDesc(
pd,
VectorPartitionDesc.createVectorDeserialize(
- inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE),
+ inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE, dataTypeInfos),
vectorPartitionDescMap);
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
@@ -1506,7 +1518,7 @@ public class Vectorizer implements PhysicalPlanResolver {
addVectorPartitionDesc(
pd,
VectorPartitionDesc.createVectorDeserialize(
- inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY),
+ inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY, dataTypeInfos),
vectorPartitionDescMap);
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
@@ -1527,7 +1539,8 @@ public class Vectorizer implements PhysicalPlanResolver {
VectorPartitionDesc.createRowDeserialize(
inputFileFormatClassName,
Utilities.isInputFileFormatSelfDescribing(pd),
- deserializerClassName),
+ deserializerClassName,
+ dataTypeInfos),
vectorPartitionDescMap);
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname);
@@ -1728,30 +1741,17 @@ public class Vectorizer implements PhysicalPlanResolver {
continue;
}
Set<Support> newSupportSet = new TreeSet<Support>();
- final boolean isVerifiedVectorPartDesc =
- verifyAndSetVectorPartDesc(
- partDesc, isFullAcidTable,
- allTypeInfoList,
- inputFileFormatClassNameSet,
- vectorPartitionDescMap,
- enabledConditionsMetSet, enabledConditionsNotMetList,
- newSupportSet);
-
- if (!isVerifiedVectorPartDesc) {
-
- // Always set these so EXPLAIN can see.
- setValidateInputFormatAndSchemaEvolutionExplain(
- mapWork, inputFileFormatClassNameSet, vectorPartitionDescMap,
- enabledConditionsMetSet, enabledConditionsNotMetList);
- // We consider this an enable issue, not a not vectorized issue.
- return new ImmutablePair<Boolean,Boolean>(false, true);
+ final List<TypeInfo> nextDataTypeInfoList;
+ final Deserializer deserializer;
+ final StructObjectInspector partObjectInspector;
+ try {
+ deserializer = partDesc.getDeserializer(hiveConf);
+ partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
+ } catch (Exception e) {
+ throw new SemanticException(e);
}
- handleSupport(isFirstPartition, inputFormatSupportSet, newSupportSet);
-
- VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
-
if (isFirst) {
/*
@@ -1778,17 +1778,55 @@ public class Vectorizer implements PhysicalPlanResolver {
isFirst = false;
}
+ if (Utilities.isInputFileFormatSelfDescribing(partDesc)) {
+
+ /*
+ * Self-Describing Input Format will convert its data to the table schema. So, there
+ * will be no VectorMapOperator conversion needed.
+ */
+ nextDataTypeInfoList = tableDataTypeInfoList;
+ } else {
+ String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
+
+ /*
+ * We convert to an array of TypeInfo using a library routine since it parses the
+ * information and can handle use of different separators, etc. We cannot use the
+ * raw type string for comparison in the map because of the different separators used.
+ */
+ nextDataTypeInfoList =
+ TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
+ }
+
+ // HIVE-20419: Vectorization: Prevent mutation of VectorPartitionDesc after being used in a
+ // hashmap key
+ final boolean isVerifiedVectorPartDesc =
+ verifyAndSetVectorPartDesc(
+ partDesc, isFullAcidTable,
+ allTypeInfoList,
+ inputFileFormatClassNameSet,
+ vectorPartitionDescMap,
+ enabledConditionsMetSet, enabledConditionsNotMetList,
+ newSupportSet,
+ nextDataTypeInfoList);
+
+ final VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
+
+ if (!isVerifiedVectorPartDesc) {
+
+ // Always set these so EXPLAIN can see.
+ setValidateInputFormatAndSchemaEvolutionExplain(
+ mapWork, inputFileFormatClassNameSet, vectorPartitionDescMap,
+ enabledConditionsMetSet, enabledConditionsNotMetList);
+
+ // We consider this an enable issue, not a not vectorized issue.
+ return new ImmutablePair<Boolean,Boolean>(false, true);
+ }
+
+ handleSupport(isFirstPartition, inputFormatSupportSet, newSupportSet);
+
// We need to get the partition's column names from the partition serde.
// (e.g. Avro provides the table schema and ignores the partition schema..).
//
- Deserializer deserializer;
- StructObjectInspector partObjectInspector;
- try {
- deserializer = partDesc.getDeserializer(hiveConf);
- partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
- } catch (Exception e) {
- throw new SemanticException(e);
- }
String nextDataColumnsString = ObjectInspectorUtils.getFieldNames(partObjectInspector);
String[] nextDataColumns = nextDataColumnsString.split(",");
List<String> nextDataColumnList = Arrays.asList(nextDataColumns);
@@ -1833,26 +1871,8 @@ public class Vectorizer implements PhysicalPlanResolver {
}
boolean isPartitionRowConversion = false;
- List<TypeInfo> nextDataTypeInfoList;
- if (vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
-
- /*
- * Self-Describing Input Format will convert its data to the table schema. So, there
- * will be no VectorMapOperator conversion needed.
- */
- nextDataTypeInfoList = tableDataTypeInfoList;
-
- } else {
- String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
-
- /*
- * We convert to an array of TypeInfo using a library routine since it parses the
- * information and can handle use of different separators, etc. We cannot use the
- * raw type string for comparison in the map because of the different separators used.
- */
- nextDataTypeInfoList =
- TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
+ if (!vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
final int nextDataTypeInfoSize = nextDataTypeInfoList.size();
if (nextDataTypeInfoSize > tableDataTypeInfoList.size()) {
enabledConditionsNotMetList.add(
@@ -1891,8 +1911,6 @@ public class Vectorizer implements PhysicalPlanResolver {
enabledConditionsMetSet, enabledConditionsNotMetList);
return new ImmutablePair<Boolean,Boolean>(false, true);
}
-
- vectorPartDesc.setDataTypeInfos(nextDataTypeInfoList);
}
// For now, we don't know which virtual columns are going to be included. We'll add them
http://git-wip-us.apache.org/repos/asf/hive/blob/cb74a685/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
index 2c8904d..dd597fb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
@@ -77,13 +77,14 @@ public class VectorPartitionDesc {
private TypeInfo[] dataTypeInfos;
private VectorPartitionDesc(String inputFileFormatClassName,
- boolean isInputFileFormatSelfDescribing, VectorMapOperatorReadType vectorMapOperatorReadType) {
+ boolean isInputFileFormatSelfDescribing, VectorMapOperatorReadType vectorMapOperatorReadType,
+ TypeInfo[] dataTypeInfos) {
this.vectorMapOperatorReadType = vectorMapOperatorReadType;
this.vectorDeserializeType = VectorDeserializeType.NONE;
this.inputFileFormatClassName = inputFileFormatClassName;
rowDeserializerClassName = null;
this.isInputFileFormatSelfDescribing = isInputFileFormatSelfDescribing;
- dataTypeInfos = null;
+ this.dataTypeInfos = dataTypeInfos;
}
/**
@@ -93,13 +94,13 @@ public class VectorPartitionDesc {
* @param needsDataTypeConversionCheck
*/
private VectorPartitionDesc(String inputFileFormatClassName,
- VectorDeserializeType vectorDeserializeType) {
+ VectorDeserializeType vectorDeserializeType, TypeInfo[] dataTypeInfos) {
this.vectorMapOperatorReadType = VectorMapOperatorReadType.VECTOR_DESERIALIZE;
this.vectorDeserializeType = vectorDeserializeType;
this.inputFileFormatClassName = inputFileFormatClassName;
rowDeserializerClassName = null;
isInputFileFormatSelfDescribing = false;
- dataTypeInfos = null;
+ this.dataTypeInfos = dataTypeInfos;
}
/**
@@ -108,32 +109,35 @@ public class VectorPartitionDesc {
* @param inputFileFormatClassName
*/
private VectorPartitionDesc(String inputFileFormatClassName,
- boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName) {
+ boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName,
+ TypeInfo[] dataTypeInfos) {
this.vectorMapOperatorReadType = VectorMapOperatorReadType.ROW_DESERIALIZE;
this.vectorDeserializeType = VectorDeserializeType.NONE;
this.inputFileFormatClassName = inputFileFormatClassName;
this.rowDeserializerClassName = rowDeserializerClassName;
this.isInputFileFormatSelfDescribing = isInputFileFormatSelfDescribing;
- dataTypeInfos = null;
+ this.dataTypeInfos = dataTypeInfos;
}
public static VectorPartitionDesc createVectorizedInputFileFormat(String inputFileFormatClassName,
- boolean isInputFileFormatSelfDescribing) {
+ boolean isInputFileFormatSelfDescribing, TypeInfo[] dataTypeInfos) {
return new VectorPartitionDesc(
inputFileFormatClassName,
isInputFileFormatSelfDescribing,
- VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT);
+ VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT,
+ dataTypeInfos);
}
public static VectorPartitionDesc createVectorDeserialize(String inputFileFormatClassName,
- VectorDeserializeType vectorDeserializeType) {
- return new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType);
+ VectorDeserializeType vectorDeserializeType, TypeInfo[] dataTypeInfos) {
+ return new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType, dataTypeInfos);
}
public static VectorPartitionDesc createRowDeserialize(String inputFileFormatClassName,
- boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName) {
+ boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName,
+ TypeInfo[] dataTypeInfos) {
return new VectorPartitionDesc(rowDeserializerClassName, isInputFileFormatSelfDescribing,
- inputFileFormatClassName);
+ inputFileFormatClassName, dataTypeInfos);
}
@Override
@@ -142,14 +146,14 @@ public class VectorPartitionDesc {
switch (vectorMapOperatorReadType) {
case VECTORIZED_INPUT_FILE_FORMAT:
result = new VectorPartitionDesc(inputFileFormatClassName, isInputFileFormatSelfDescribing,
- vectorMapOperatorReadType);
+ vectorMapOperatorReadType, dataTypeInfos);
break;
case VECTOR_DESERIALIZE:
- result = new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType);
+ result = new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType, dataTypeInfos);
break;
case ROW_DESERIALIZE:
result = new VectorPartitionDesc(inputFileFormatClassName, isInputFileFormatSelfDescribing,
- rowDeserializerClassName);
+ rowDeserializerClassName, dataTypeInfos);
break;
default:
throw new RuntimeException("Unexpected vector map operator read type " + vectorMapOperatorReadType.name());
http://git-wip-us.apache.org/repos/asf/hive/blob/cb74a685/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 3a83408..91458ea 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -2208,7 +2208,7 @@ public class TestInputOutputFormat {
PartitionDesc part = new PartitionDesc(tbl, partSpec);
if (isVectorized) {
part.setVectorPartitionDesc(
- VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
+ VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false, null));
}
partMap.put(path, part);
}