You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ab...@apache.org on 2022/07/26 06:40:47 UTC

[hive] branch master updated: HIVE-26408: Vectorization: Fix deallocation of scratch columns, don't reuse a child ConstantVectorExpression as an output (#3452) (Laszlo Bodor reviewed by Ayush Saxena)

This is an automated email from the ASF dual-hosted git repository.

abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new b8ed1f8b434 HIVE-26408: Vectorization: Fix deallocation of scratch columns, don't reuse a child ConstantVectorExpression as an output (#3452) (Laszlo Bodor reviewed by Ayush Saxena)
b8ed1f8b434 is described below

commit b8ed1f8b434fd432121868d00f153eddf8f7eb77
Author: Bodor Laszlo <bo...@gmail.com>
AuthorDate: Tue Jul 26 08:40:35 2022 +0200

    HIVE-26408: Vectorization: Fix deallocation of scratch columns, don't reuse a child ConstantVectorExpression as an output (#3452) (Laszlo Bodor reviewed by Ayush Saxena)
---
 data/files/scratch_col_issue_test_data/data.csv    |   2 +
 .../hive/ql/exec/vector/VectorizationContext.java  |  18 +-
 .../queries/clientpositive/scratch_col_issue.q     |  71 +++++
 .../clientpositive/llap/scratch_col_issue.q.out    | 332 +++++++++++++++++++++
 4 files changed, 422 insertions(+), 1 deletion(-)

diff --git a/data/files/scratch_col_issue_test_data/data.csv b/data/files/scratch_col_issue_test_data/data.csv
new file mode 100644
index 00000000000..c43afddf3b0
--- /dev/null
+++ b/data/files/scratch_col_issue_test_data/data.csv
@@ -0,0 +1,2 @@
+8800;MMDA;NULL
+8800;CertificateOfDeposit;NULL
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 6d0e4899e68..b5b0b764cd0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -839,6 +839,20 @@ import com.google.common.annotations.VisibleForTesting;
       markedScratchColumns = Arrays.copyOf(scratchColumnTrackWasUsed, scratchColumnTrackWasUsed.length);
     }
 
+    public String toString() { // multi-line state summary: one header, then one line per column index
+      StringBuilder builder = new StringBuilder();
+      builder.append(String.format( // header: the counters and the reuseScratchColumns flag
+          "OutputColumnManager: initialOutputCol: %d, outputColCount: %d, usedOutputColumns#: %d"
+              + ", reuseScratchColumns: %s, output cols:",
+          initialOutputCol, outputColCount, usedOutputColumns.size(), reuseScratchColumns));
+      for (int i = 0; i < outputColCount; i++) { // i indexes the three per-column arrays read below
+        builder.append(String.format( // "%d (%d)" prints i, then i + initialOutputCol
+            "\n%d (%d): used: %s, scratchVectorTypeName: %s" + ", physicalVariation: %s, trackWasUsed: %s", i,
+            i + initialOutputCol, usedOutputColumns.contains(i), scratchVectorTypeNames[i],
+            scratchDataTypePhysicalVariations[i], scratchColumnTrackWasUsed[i]));
+      } // NOTE(review): i + initialOutputCol is presumably the absolute batch column number - confirm
+      return builder.toString();
+    }
   }
 
   public int allocateScratchColumn(TypeInfo typeInfo) throws HiveException {
@@ -2461,7 +2475,9 @@ import com.google.common.annotations.VisibleForTesting;
       return;
     }
     for (VectorExpression v : vectorChildren) {
-      if (!(v instanceof IdentityExpression)) {
+      if (!(v instanceof IdentityExpression
+            // it's not safe to reuse ConstantVectorExpression's output as a scratch column, see HIVE-26408
+            || v instanceof ConstantVectorExpression)) {
         ocm.freeOutputColumn(v.getOutputColumnNum());
       }
     }
diff --git a/ql/src/test/queries/clientpositive/scratch_col_issue.q b/ql/src/test/queries/clientpositive/scratch_col_issue.q
new file mode 100644
index 00000000000..532bfa181c2
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/scratch_col_issue.q
@@ -0,0 +1,71 @@
+-- Regression test for HIVE-26408: scratch column reuse with a child ConstantVectorExpression.
+set hive.fetch.task.conversion=none;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.vectorized.execution.enabled=true;
+set hive.vectorized.reuse.scratch.columns=true;
+set hive.auto.convert.join=true;
+
+CREATE EXTERNAL TABLE scratch_col_issue_txt(
+  `id` int,
+  `value` string,
+  `date_string` string)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ';'
+LINES TERMINATED BY '\n'
+LOCATION '../../data/files/scratch_col_issue_test_data';
+
+CREATE TABLE scratch_col_issue
+AS SELECT * FROM scratch_col_issue_txt;
+
+DESCRIBE FORMATTED scratch_col_issue;
+
+EXPLAIN VECTORIZATION DETAIL SELECT
+  CASE WHEN scratch_col_issue.value in (
+    'TermDeposit', 'RecurringDeposit',
+    'CertificateOfDeposit'
+  ) THEN NVL(
+    (
+      from_unixtime(
+        unix_timestamp(
+          cast(scratch_col_issue.date_string as date)
+        ),
+        'MM-dd-yyyy'
+      )
+    ),
+    ' '
+  ) ELSE '' END AS MAT_DTE
+FROM
+  scratch_col_issue
+WHERE
+  NVL(scratch_col_issue.id, '') IN (
+      SELECT
+        EXPLODE(
+          SPLIT('8800', ',')
+        )
+  );
+
+SELECT
+  CASE WHEN scratch_col_issue.value in (
+    'TermDeposit', 'RecurringDeposit',
+    'CertificateOfDeposit'
+  ) THEN NVL(
+    (
+      from_unixtime(
+        unix_timestamp(
+          cast(scratch_col_issue.date_string as date)
+        ),
+        'MM-dd-yyyy'
+      )
+    ),
+    ' '
+  ) ELSE '' END AS MAT_DTE
+FROM
+  scratch_col_issue
+WHERE
+  NVL(scratch_col_issue.id, '') IN (
+      SELECT
+        EXPLODE(
+          SPLIT('8800', ',')
+        )
+  );
diff --git a/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out
new file mode 100644
index 00000000000..2e0784f5d23
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out
@@ -0,0 +1,332 @@
+PREHOOK: query: CREATE EXTERNAL TABLE scratch_col_issue_txt(
+  `id` int,
+  `value` string,
+  `date_string` string)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ';'
+LINES TERMINATED BY '\n'
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@scratch_col_issue_txt
+POSTHOOK: query: CREATE EXTERNAL TABLE scratch_col_issue_txt(
+  `id` int,
+  `value` string,
+  `date_string` string)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ';'
+LINES TERMINATED BY '\n'
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@scratch_col_issue_txt
+PREHOOK: query: CREATE TABLE scratch_col_issue
+AS SELECT * FROM scratch_col_issue_txt
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@scratch_col_issue_txt
+PREHOOK: Output: database:default
+PREHOOK: Output: default@scratch_col_issue
+POSTHOOK: query: CREATE TABLE scratch_col_issue
+AS SELECT * FROM scratch_col_issue_txt
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@scratch_col_issue_txt
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@scratch_col_issue
+POSTHOOK: Lineage: scratch_col_issue.date_string SIMPLE [(scratch_col_issue_txt)scratch_col_issue_txt.FieldSchema(name:date_string, type:string, comment:null), ]
+POSTHOOK: Lineage: scratch_col_issue.id SIMPLE [(scratch_col_issue_txt)scratch_col_issue_txt.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: scratch_col_issue.value SIMPLE [(scratch_col_issue_txt)scratch_col_issue_txt.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: DESCRIBE FORMATTED scratch_col_issue
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@scratch_col_issue
+POSTHOOK: query: DESCRIBE FORMATTED scratch_col_issue
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@scratch_col_issue
+# col_name            	data_type           	comment             
+id                  	int                 	                    
+value               	string              	                    
+date_string         	string              	                    
+	 	 
+# Detailed Table Information	 	 
+Database:           	default             	 
+#### A masked pattern was here ####
+Retention:          	0                   	 
+#### A masked pattern was here ####
+Table Type:         	MANAGED_TABLE       	 
+Table Parameters:	 	 
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"date_string\":\"true\",\"id\":\"true\",\"value\":\"true\"}}
+	bucketing_version   	2                   
+	numFiles            	1                   
+	numRows             	2                   
+	rawDataSize         	44                  
+	totalSize           	46                  
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+Storage Desc Params:	 	 
+	serialization.format	1                   
+PREHOOK: query: EXPLAIN VECTORIZATION DETAIL SELECT
+  CASE WHEN scratch_col_issue.value in (
+    'TermDeposit', 'RecurringDeposit',
+    'CertificateOfDeposit'
+  ) THEN NVL(
+    (
+      from_unixtime(
+        unix_timestamp(
+          cast(scratch_col_issue.date_string as date)
+        ),
+        'MM-dd-yyyy'
+      )
+    ),
+    ' '
+  ) ELSE '' END AS MAT_DTE
+FROM
+  scratch_col_issue
+WHERE
+  NVL(scratch_col_issue.id, '') IN (
+      SELECT
+        EXPLODE(
+          SPLIT('8800', ',')
+        )
+  )
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL SELECT
+  CASE WHEN scratch_col_issue.value in (
+    'TermDeposit', 'RecurringDeposit',
+    'CertificateOfDeposit'
+  ) THEN NVL(
+    (
+      from_unixtime(
+        unix_timestamp(
+          cast(scratch_col_issue.date_string as date)
+        ),
+        'MM-dd-yyyy'
+      )
+    ),
+    ' '
+  ) ELSE '' END AS MAT_DTE
+FROM
+  scratch_col_issue
+WHERE
+  NVL(scratch_col_issue.id, '') IN (
+      SELECT
+        EXPLODE(
+          SPLIT('8800', ',')
+        )
+  )
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Map 2 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: scratch_col_issue
+                  filterExpr: COALESCE(id,'') is not null (type: boolean)
+                  Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:id:int, 1:value:string, 2:date_string:string, 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 4:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 7:string)(children: VectorCoalesce(columns [5, 6])(children: CastLongToString(col 0:int) -> 5:string, ConstantVectorExpression(val ) -> 6:string) -> 7:string)
+                    predicate: COALESCE(id,'') is not null (type: boolean)
+                    Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: id (type: int), value (type: string), date_string (type: string)
+                      outputColumnNames: _col0, _col1, _col2
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0, 1, 2]
+                      Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE
+                      Map Join Operator
+                        condition map:
+                             Left Semi Join 0 to 1
+                        keys:
+                          0 COALESCE(_col0,'') (type: string)
+                          1 _col0 (type: string)
+                        Map Join Vectorization:
+                            bigTableKeyColumns: 8:string
+                            bigTableKeyExpressions: VectorCoalesce(columns [5, 7])(children: CastLongToString(col 0:int) -> 5:string, ConstantVectorExpression(val ) -> 7:string) -> 8:string
+                            bigTableRetainColumnNums: [1, 2]
+                            bigTableValueColumns: 1:string, 2:string
+                            className: VectorMapJoinLeftSemiStringOperator
+                            native: true
+                            nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                            nonOuterSmallTableKeyMapping: []
+                            projectedOutput: 1:string, 2:string
+                            hashTableImplementationType: OPTIMIZED
+                        outputColumnNames: _col1, _col2
+                        input vertices:
+                          1 Map 2
+                        Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
+                        Select Operator
+                          expressions: if((_col1) IN ('TermDeposit', 'RecurringDeposit', 'CertificateOfDeposit'), COALESCE(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'),' '), '') (type: string)
+                          outputColumnNames: _col0
+                          Select Vectorization:
+                              className: VectorSelectOperator
+                              native: true
+                              projectedOutputColumnNums: [14]
+                              selectExpressions: IfExprCondExprColumn(col 9:boolean, col 13:string, col 5:string)(children: StringColumnInList(col 1, values TermDeposit, RecurringDeposit, CertificateOfDeposit) -> 9:boolean, VectorCoalesce(columns [5, 12])(children: VectorUDFAdaptor(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'))(children: VectorUDFUnixTimeStampDate(col 10)(children: CastStringToDate(col 2:string) -> 10:date) -> 11:bigint) -> 5:string, ConstantVector [...]
+                          Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
+                          File Output Operator
+                            compressed: false
+                            File Sink Vectorization:
+                                className: VectorFileSinkOperator
+                                native: false
+                            Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
+                            table:
+                                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: true
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    includeColumns: [0, 1, 2]
+                    dataColumns: id:int, value:string, date_string:string
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [string, string, string, string, bigint, bigint, bigint, string, string, string]
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: _dummy_table
+                  Row Limit Per Split: 1
+                  Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: split('8800', ',') (type: array<string>)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE
+                    UDTF Operator
+                      Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE
+                      function name: explode
+                      Filter Operator
+                        predicate: col is not null (type: boolean)
+                        Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE
+                        Select Operator
+                          expressions: col (type: string)
+                          outputColumnNames: _col0
+                          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                          Group By Operator
+                            keys: _col0 (type: string)
+                            minReductionHashAggr: 0.4
+                            mode: hash
+                            outputColumnNames: _col0
+                            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                            Reduce Output Operator
+                              key expressions: _col0 (type: string)
+                              null sort order: z
+                              sort order: +
+                              Map-reduce partition columns: _col0 (type: string)
+                              Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: no inputs
+            Map Vectorization:
+                enabled: false
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                enabledConditionsNotMet: Could not enable vectorization due to partition column names size 1 is greater than the number of table column names size 0 IS false
+                inputFileFormats: org.apache.hadoop.hive.ql.io.NullRowsInputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT
+  CASE WHEN scratch_col_issue.value in (
+    'TermDeposit', 'RecurringDeposit',
+    'CertificateOfDeposit'
+  ) THEN NVL(
+    (
+      from_unixtime(
+        unix_timestamp(
+          cast(scratch_col_issue.date_string as date)
+        ),
+        'MM-dd-yyyy'
+      )
+    ),
+    ' '
+  ) ELSE '' END AS MAT_DTE
+FROM
+  scratch_col_issue
+WHERE
+  NVL(scratch_col_issue.id, '') IN (
+      SELECT
+        EXPLODE(
+          SPLIT('8800', ',')
+        )
+  )
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+  CASE WHEN scratch_col_issue.value in (
+    'TermDeposit', 'RecurringDeposit',
+    'CertificateOfDeposit'
+  ) THEN NVL(
+    (
+      from_unixtime(
+        unix_timestamp(
+          cast(scratch_col_issue.date_string as date)
+        ),
+        'MM-dd-yyyy'
+      )
+    ),
+    ' '
+  ) ELSE '' END AS MAT_DTE
+FROM
+  scratch_col_issue
+WHERE
+  NVL(scratch_col_issue.id, '') IN (
+      SELECT
+        EXPLODE(
+          SPLIT('8800', ',')
+        )
+  )
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+
+