Posted to commits@hive.apache.org by tc...@apache.org on 2019/01/22 06:14:50 UTC

hive git commit: HIVE-20419: Vectorization: Prevent mutation of VectorPartitionDesc after being used in a hashmap key (Teddy Choi, reviewed by Gopal V)

Repository: hive
Updated Branches:
  refs/heads/master 34c8ca432 -> cb74a685c


HIVE-20419: Vectorization: Prevent mutation of VectorPartitionDesc after being used in a hashmap key (Teddy Choi, reviewed by Gopal V)

Change-Id: Ie9ae156c6b25f39dfdab1742b0c35219c8275062


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/cb74a685
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/cb74a685
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/cb74a685

Branch: refs/heads/master
Commit: cb74a685ce2b09f8deaefa9805361ef96c26eccb
Parents: 34c8ca4
Author: Teddy Choi <tc...@hortonworks.com>
Authored: Tue Jan 22 15:14:26 2019 +0900
Committer: Teddy Choi <tc...@hortonworks.com>
Committed: Tue Jan 22 15:14:26 2019 +0900

----------------------------------------------------------------------
 .../hive/ql/optimizer/physical/Vectorizer.java  | 130 +++++++++++--------
 .../hive/ql/plan/VectorPartitionDesc.java       |  34 ++---
 .../hive/ql/io/orc/TestInputOutputFormat.java   |   2 +-
 3 files changed, 94 insertions(+), 72 deletions(-)
----------------------------------------------------------------------
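
Background on the fix: Vectorizer uses VectorPartitionDesc instances as keys in vectorPartitionDescMap, and before this patch their dataTypeInfos field was filled in via setDataTypeInfos() only after the key had been added to the map. The sketch below is plain Java, not Hive code; the names are illustrative, and it simply shows the general hazard the issue title refers to, assuming key equality/hashing depends on the mutated field: once a field that feeds hashCode()/equals() changes after insertion, the entry can become unreachable.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

// Illustrative only: a key whose hashCode()/equals() depend on a mutable field.
final class MutableKey {
  private String[] types;

  MutableKey(String[] types) { this.types = types; }

  // Mutating the key after it has been put into a HashMap is the hazard.
  void setTypes(String[] types) { this.types = types; }

  @Override public int hashCode() { return Arrays.hashCode(types); }
  @Override public boolean equals(Object o) {
    return o instanceof MutableKey && Arrays.equals(types, ((MutableKey) o).types);
  }
}

public class HashKeyMutationDemo {
  public static void main(String[] args) {
    Map<MutableKey, String> map = new HashMap<>();
    MutableKey key = new MutableKey(new String[] {"int", "string"});
    map.put(key, "vector partition desc");

    key.setTypes(new String[] {"int", "string", "bigint"});  // mutation after use as a key

    // The entry is still stored, but lookups now hash to a different bucket, so
    // these typically print null and false even though 'key' is the same object.
    System.out.println(map.get(key));
    System.out.println(map.containsKey(key));
  }
}

The patch drops the post-insertion setDataTypeInfos() call: dataTypeInfos is computed up front and handed to the VectorPartitionDesc factory methods instead, so the key is complete before it ever reaches the map.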


http://git-wip-us.apache.org/repos/asf/hive/blob/cb74a685/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 0a1a25f..5023f2f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -249,6 +249,8 @@ public class Vectorizer implements PhysicalPlanResolver {
 
   private static final Pattern supportedDataTypesPattern;
 
+  private static final TypeInfo[] EMPTY_TYPEINFO_ARRAY = new TypeInfo[0];
+
   static {
     StringBuilder patternBuilder = new StringBuilder();
     patternBuilder.append("int");
@@ -1372,10 +1374,16 @@ public class Vectorizer implements PhysicalPlanResolver {
         Set<String> inputFileFormatClassNameSet,
         Map<VectorPartitionDesc, VectorPartitionDesc> vectorPartitionDescMap,
         Set<String> enabledConditionsMetSet, ArrayList<String> enabledConditionsNotMetList,
-        Set<Support> newSupportSet) {
+        Set<Support> newSupportSet, List<TypeInfo> dataTypeInfoList) {
 
       Class<? extends InputFormat> inputFileFormatClass = pd.getInputFileFormatClass();
       String inputFileFormatClassName = inputFileFormatClass.getName();
+      final TypeInfo[] dataTypeInfos;
+      if (dataTypeInfoList == null) {
+        dataTypeInfos = EMPTY_TYPEINFO_ARRAY;
+      } else {
+        dataTypeInfos = dataTypeInfoList.toArray(new TypeInfo[dataTypeInfoList.size()]);
+      }
 
       // Always collect input file formats.
       inputFileFormatClassNameSet.add(inputFileFormatClassName);
@@ -1401,7 +1409,9 @@ public class Vectorizer implements PhysicalPlanResolver {
         addVectorPartitionDesc(
             pd,
             VectorPartitionDesc.createVectorizedInputFileFormat(
-                inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)),
+                inputFileFormatClassName,
+                Utilities.isInputFileFormatSelfDescribing(pd),
+                dataTypeInfos),
             vectorPartitionDescMap);
 
         enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
@@ -1427,7 +1437,9 @@ public class Vectorizer implements PhysicalPlanResolver {
           addVectorPartitionDesc(
               pd,
               VectorPartitionDesc.createVectorizedInputFileFormat(
-                  inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)),
+                  inputFileFormatClassName,
+                  Utilities.isInputFileFormatSelfDescribing(pd),
+                  dataTypeInfos),
               vectorPartitionDescMap);
 
           enabledConditionsMetSet.add(
@@ -1495,7 +1507,7 @@ public class Vectorizer implements PhysicalPlanResolver {
             addVectorPartitionDesc(
                 pd,
                 VectorPartitionDesc.createVectorDeserialize(
-                    inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE),
+                    inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE, dataTypeInfos),
                 vectorPartitionDescMap);
 
             enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
@@ -1506,7 +1518,7 @@ public class Vectorizer implements PhysicalPlanResolver {
           addVectorPartitionDesc(
               pd,
               VectorPartitionDesc.createVectorDeserialize(
-                  inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY),
+                  inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY, dataTypeInfos),
               vectorPartitionDescMap);
 
           enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
@@ -1527,7 +1539,8 @@ public class Vectorizer implements PhysicalPlanResolver {
               VectorPartitionDesc.createRowDeserialize(
                   inputFileFormatClassName,
                   Utilities.isInputFileFormatSelfDescribing(pd),
-                  deserializerClassName),
+                  deserializerClassName,
+                  dataTypeInfos),
               vectorPartitionDescMap);
 
           enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname);
@@ -1728,30 +1741,17 @@ public class Vectorizer implements PhysicalPlanResolver {
           continue;
         }
         Set<Support> newSupportSet = new TreeSet<Support>();
-        final boolean isVerifiedVectorPartDesc =
-            verifyAndSetVectorPartDesc(
-              partDesc, isFullAcidTable,
-              allTypeInfoList,
-              inputFileFormatClassNameSet,
-              vectorPartitionDescMap,
-              enabledConditionsMetSet, enabledConditionsNotMetList,
-              newSupportSet);
-
-        if (!isVerifiedVectorPartDesc) {
-
-          // Always set these so EXPLAIN can see.
-          setValidateInputFormatAndSchemaEvolutionExplain(
-              mapWork, inputFileFormatClassNameSet, vectorPartitionDescMap,
-              enabledConditionsMetSet, enabledConditionsNotMetList);
 
-          // We consider this an enable issue, not a not vectorized issue.
-          return new ImmutablePair<Boolean,Boolean>(false, true);
+        final List<TypeInfo> nextDataTypeInfoList;
+        final Deserializer deserializer;
+        final StructObjectInspector partObjectInspector;
+        try {
+          deserializer = partDesc.getDeserializer(hiveConf);
+          partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
+        } catch (Exception e) {
+          throw new SemanticException(e);
         }
 
-        handleSupport(isFirstPartition, inputFormatSupportSet, newSupportSet);
-
-        VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
-
         if (isFirst) {
 
           /*
@@ -1778,17 +1778,55 @@ public class Vectorizer implements PhysicalPlanResolver {
           isFirst = false;
         }
 
+        if (Utilities.isInputFileFormatSelfDescribing(partDesc)) {
+
+          /*
+           * Self-Describing Input Format will convert its data to the table schema. So, there
+           * will be no VectorMapOperator conversion needed.
+           */
+          nextDataTypeInfoList = tableDataTypeInfoList;
+        } else {
+          String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
+
+          /*
+           * We convert to an array of TypeInfo using a library routine since it parses the
+           * information and can handle use of different separators, etc.  We cannot use the
+           * raw type string for comparison in the map because of the different separators used.
+           */
+          nextDataTypeInfoList =
+              TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
+        }
+
+        // HIVE-20419: Vectorization: Prevent mutation of VectorPartitionDesc after being used in a
+        // hashmap key
+        final boolean isVerifiedVectorPartDesc =
+            verifyAndSetVectorPartDesc(
+              partDesc, isFullAcidTable,
+              allTypeInfoList,
+              inputFileFormatClassNameSet,
+              vectorPartitionDescMap,
+              enabledConditionsMetSet, enabledConditionsNotMetList,
+              newSupportSet,
+              nextDataTypeInfoList);
+
+        final VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
+
+        if (!isVerifiedVectorPartDesc) {
+
+          // Always set these so EXPLAIN can see.
+          setValidateInputFormatAndSchemaEvolutionExplain(
+              mapWork, inputFileFormatClassNameSet, vectorPartitionDescMap,
+              enabledConditionsMetSet, enabledConditionsNotMetList);
+
+          // We consider this an enable issue, not a not vectorized issue.
+          return new ImmutablePair<Boolean,Boolean>(false, true);
+        }
+
+        handleSupport(isFirstPartition, inputFormatSupportSet, newSupportSet);
+
         // We need to get the partition's column names from the partition serde.
         // (e.g. Avro provides the table schema and ignores the partition schema..).
         //
-        Deserializer deserializer;
-        StructObjectInspector partObjectInspector;
-        try {
-          deserializer = partDesc.getDeserializer(hiveConf);
-          partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
-        } catch (Exception e) {
-          throw new SemanticException(e);
-        }
         String nextDataColumnsString = ObjectInspectorUtils.getFieldNames(partObjectInspector);
         String[] nextDataColumns = nextDataColumnsString.split(",");
         List<String> nextDataColumnList = Arrays.asList(nextDataColumns);
@@ -1833,26 +1871,8 @@ public class Vectorizer implements PhysicalPlanResolver {
         }
 
         boolean isPartitionRowConversion = false;
-        List<TypeInfo> nextDataTypeInfoList;
-        if (vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
-
-          /*
-           * Self-Describing Input Format will convert its data to the table schema. So, there
-           * will be no VectorMapOperator conversion needed.
-           */
-          nextDataTypeInfoList = tableDataTypeInfoList;
-
-        } else {
-          String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
-
-          /*
-           * We convert to an array of TypeInfo using a library routine since it parses the
-           * information and can handle use of different separators, etc.  We cannot use the
-           * raw type string for comparison in the map because of the different separators used.
-           */
-          nextDataTypeInfoList =
-              TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
 
+        if (!vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
           final int nextDataTypeInfoSize = nextDataTypeInfoList.size();
           if (nextDataTypeInfoSize > tableDataTypeInfoList.size()) {
             enabledConditionsNotMetList.add(
@@ -1891,8 +1911,6 @@ public class Vectorizer implements PhysicalPlanResolver {
               enabledConditionsMetSet, enabledConditionsNotMetList);
           return new ImmutablePair<Boolean,Boolean>(false, true);
         }
-
-        vectorPartDesc.setDataTypeInfos(nextDataTypeInfoList);
       }
 
       // For now, we don't know which virtual columns are going to be included.  We'll add them
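
The net effect of the Vectorizer.java hunk above is a reordering: the partition's data TypeInfos are now derived before verifyAndSetVectorPartDesc() builds and registers the VectorPartitionDesc, instead of being set on the desc afterwards. A condensed, hypothetical helper showing just that derivation (only the calls visible in the diff are taken as given; the wrapper class and method are illustrative):

import java.util.List;

import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

class PartitionTypeInfoHelper {

  // Returns the partition's data TypeInfos before any VectorPartitionDesc exists,
  // which is what allows the desc to be fully populated at construction time.
  static List<TypeInfo> dataTypeInfosFor(PartitionDesc partDesc,
      StructObjectInspector partObjectInspector, List<TypeInfo> tableDataTypeInfoList) {
    if (Utilities.isInputFileFormatSelfDescribing(partDesc)) {
      // Self-describing input formats convert their data to the table schema,
      // so no VectorMapOperator conversion is needed.
      return tableDataTypeInfoList;
    }
    // Otherwise parse the partition's own field types from its ObjectInspector;
    // the library routine tolerates the different separators used in type strings.
    String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
    return TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
  }
}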

http://git-wip-us.apache.org/repos/asf/hive/blob/cb74a685/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
index 2c8904d..dd597fb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
@@ -77,13 +77,14 @@ public class VectorPartitionDesc  {
   private TypeInfo[] dataTypeInfos;
 
   private VectorPartitionDesc(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing, VectorMapOperatorReadType vectorMapOperatorReadType) {
+      boolean isInputFileFormatSelfDescribing, VectorMapOperatorReadType vectorMapOperatorReadType,
+      TypeInfo[] dataTypeInfos) {
     this.vectorMapOperatorReadType = vectorMapOperatorReadType;
     this.vectorDeserializeType = VectorDeserializeType.NONE;
     this.inputFileFormatClassName = inputFileFormatClassName;
     rowDeserializerClassName = null;
     this.isInputFileFormatSelfDescribing = isInputFileFormatSelfDescribing;
-    dataTypeInfos = null;
+    this.dataTypeInfos = dataTypeInfos;
   }
 
   /**
@@ -93,13 +94,13 @@ public class VectorPartitionDesc  {
    * @param needsDataTypeConversionCheck
    */
   private VectorPartitionDesc(String inputFileFormatClassName,
-      VectorDeserializeType vectorDeserializeType) {
+      VectorDeserializeType vectorDeserializeType, TypeInfo[] dataTypeInfos) {
     this.vectorMapOperatorReadType = VectorMapOperatorReadType.VECTOR_DESERIALIZE;
     this.vectorDeserializeType = vectorDeserializeType;
     this.inputFileFormatClassName = inputFileFormatClassName;
     rowDeserializerClassName = null;
     isInputFileFormatSelfDescribing = false;
-    dataTypeInfos = null;
+    this.dataTypeInfos = dataTypeInfos;
   }
 
   /**
@@ -108,32 +109,35 @@ public class VectorPartitionDesc  {
    * @param inputFileFormatClassName
    */
   private VectorPartitionDesc(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName) {
+      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName,
+      TypeInfo[] dataTypeInfos) {
     this.vectorMapOperatorReadType = VectorMapOperatorReadType.ROW_DESERIALIZE;
     this.vectorDeserializeType = VectorDeserializeType.NONE;
     this.inputFileFormatClassName = inputFileFormatClassName;
     this.rowDeserializerClassName = rowDeserializerClassName;
     this.isInputFileFormatSelfDescribing = isInputFileFormatSelfDescribing;
-    dataTypeInfos = null;
+    this.dataTypeInfos = dataTypeInfos;
   }
 
   public static VectorPartitionDesc createVectorizedInputFileFormat(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing) {
+      boolean isInputFileFormatSelfDescribing, TypeInfo[] dataTypeInfos) {
     return new VectorPartitionDesc(
         inputFileFormatClassName,
         isInputFileFormatSelfDescribing,
-        VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT);
+        VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT,
+        dataTypeInfos);
   }
 
   public static VectorPartitionDesc createVectorDeserialize(String inputFileFormatClassName,
-      VectorDeserializeType vectorDeserializeType) {
-    return new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType);
+      VectorDeserializeType vectorDeserializeType, TypeInfo[] dataTypeInfos) {
+    return new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType, dataTypeInfos);
   }
 
   public static VectorPartitionDesc createRowDeserialize(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName) {
+      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName,
+      TypeInfo[] dataTypeInfos) {
     return new VectorPartitionDesc(rowDeserializerClassName, isInputFileFormatSelfDescribing,
-        inputFileFormatClassName);
+        inputFileFormatClassName, dataTypeInfos);
   }
 
   @Override
@@ -142,14 +146,14 @@ public class VectorPartitionDesc  {
     switch (vectorMapOperatorReadType) {
     case VECTORIZED_INPUT_FILE_FORMAT:
       result = new VectorPartitionDesc(inputFileFormatClassName, isInputFileFormatSelfDescribing,
-          vectorMapOperatorReadType);
+          vectorMapOperatorReadType, dataTypeInfos);
       break;
     case VECTOR_DESERIALIZE:
-      result = new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType);
+      result = new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType, dataTypeInfos);
       break;
     case ROW_DESERIALIZE:
       result = new VectorPartitionDesc(inputFileFormatClassName, isInputFileFormatSelfDescribing,
-          rowDeserializerClassName);
+          rowDeserializerClassName, dataTypeInfos);
       break;
     default:
       throw new RuntimeException("Unexpected vector map operator read type " + vectorMapOperatorReadType.name());
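
With the constructor and factory changes above, callers supply dataTypeInfos up front rather than calling a setter later. A hypothetical caller sketch (the input format class name and the type string literal are placeholders; only the factory method and its parameters come from this diff):

import java.util.List;

import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

class VectorPartitionDescUsage {

  static VectorPartitionDesc example() {
    // Placeholder schema; in Vectorizer the list comes from the table or partition schema.
    List<TypeInfo> typeInfoList = TypeInfoUtils.getTypeInfosFromTypeString("int:string:bigint");
    TypeInfo[] dataTypeInfos = typeInfoList.toArray(new TypeInfo[0]);

    // dataTypeInfos is a construction-time argument now, so the desc is complete
    // before it is ever used as a hashmap key.
    return VectorPartitionDesc.createVectorizedInputFileFormat(
        "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",  // placeholder input format class name
        true,                                                // isInputFileFormatSelfDescribing
        dataTypeInfos);
  }
}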

http://git-wip-us.apache.org/repos/asf/hive/blob/cb74a685/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 3a83408..91458ea 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -2208,7 +2208,7 @@ public class TestInputOutputFormat {
       PartitionDesc part = new PartitionDesc(tbl, partSpec);
       if (isVectorized) {
         part.setVectorPartitionDesc(
-            VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
+            VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false, null));
       }
       partMap.put(path, part);
     }