You are viewing a plain text version of this content. The hyperlink to the canonical version was not preserved in this plain-text copy.
Posted to commits@hive.apache.org by ab...@apache.org on 2022/07/26 06:40:47 UTC
[hive] branch master updated: HIVE-26408: Vectorization: Fix deallocation of scratch columns, don't reuse a child ConstantVectorExpression as an output (#3452) (Laszlo Bodor reviewed by Ayush Saxena)
This is an automated email from the ASF dual-hosted git repository.
abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new b8ed1f8b434 HIVE-26408: Vectorization: Fix deallocation of scratch columns, don't reuse a child ConstantVectorExpression as an output (#3452) (Laszlo Bodor reviewed by Ayush Saxena)
b8ed1f8b434 is described below
commit b8ed1f8b434fd432121868d00f153eddf8f7eb77
Author: Bodor Laszlo <bo...@gmail.com>
AuthorDate: Tue Jul 26 08:40:35 2022 +0200
HIVE-26408: Vectorization: Fix deallocation of scratch columns, don't reuse a child ConstantVectorExpression as an output (#3452) (Laszlo Bodor reviewed by Ayush Saxena)
---
data/files/scratch_col_issue_test_data/data.csv | 2 +
.../hive/ql/exec/vector/VectorizationContext.java | 18 +-
.../queries/clientpositive/scratch_col_issue.q | 71 +++++
.../clientpositive/llap/scratch_col_issue.q.out | 332 +++++++++++++++++++++
4 files changed, 422 insertions(+), 1 deletion(-)
diff --git a/data/files/scratch_col_issue_test_data/data.csv b/data/files/scratch_col_issue_test_data/data.csv
new file mode 100644
index 00000000000..c43afddf3b0
--- /dev/null
+++ b/data/files/scratch_col_issue_test_data/data.csv
@@ -0,0 +1,2 @@
+8800;MMDA;NULL
+8800;CertificateOfDeposit;NULL
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 6d0e4899e68..b5b0b764cd0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -839,6 +839,20 @@ import com.google.common.annotations.VisibleForTesting;
markedScratchColumns = Arrays.copyOf(scratchColumnTrackWasUsed, scratchColumnTrackWasUsed.length);
}
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(String.format(
+ "OutputColumnManager: initialOutputCol: %d, outputColCount: %d, usedOutputColumns#: %d"
+ + ", reuseScratchColumns: %s, output cols:",
+ initialOutputCol, outputColCount, usedOutputColumns.size(), reuseScratchColumns));
+ for (int i = 0; i < outputColCount; i++) {
+ builder.append(String.format(
+ "\n%d (%d): used: %s, scratchVectorTypeName: %s" + ", physicalVariation: %s, trackWasUsed: %s", i,
+ i + initialOutputCol, usedOutputColumns.contains(i), scratchVectorTypeNames[i],
+ scratchDataTypePhysicalVariations[i], scratchColumnTrackWasUsed[i]));
+ }
+ return builder.toString();
+ }
}
public int allocateScratchColumn(TypeInfo typeInfo) throws HiveException {
@@ -2461,7 +2475,9 @@ import com.google.common.annotations.VisibleForTesting;
return;
}
for (VectorExpression v : vectorChildren) {
- if (!(v instanceof IdentityExpression)) {
+ if (!(v instanceof IdentityExpression
+ // it's not safe to reuse ConstantVectorExpression's output as a scratch column, see HIVE-26408
+ || v instanceof ConstantVectorExpression)) {
ocm.freeOutputColumn(v.getOutputColumnNum());
}
}
diff --git a/ql/src/test/queries/clientpositive/scratch_col_issue.q b/ql/src/test/queries/clientpositive/scratch_col_issue.q
new file mode 100644
index 00000000000..532bfa181c2
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/scratch_col_issue.q
@@ -0,0 +1,71 @@
+
+set hive.fetch.task.conversion=none;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.vectorized.execution.enabled=true;
+set hive.vectorized.reuse.scratch.columns=true;
+set hive.auto.convert.join=true;
+
+CREATE EXTERNAL TABLE scratch_col_issue_txt(
+ `id` int,
+ `value` string,
+ `date_string` string)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ';'
+LINES TERMINATED BY '\n'
+LOCATION '../../data/files/scratch_col_issue_test_data';
+
+CREATE TABLE scratch_col_issue
+AS SELECT * FROM scratch_col_issue_txt;
+
+DESCRIBE FORMATTED scratch_col_issue;
+
+EXPLAIN VECTORIZATION DETAIL SELECT
+ CASE WHEN scratch_col_issue.value in (
+ 'TermDeposit', 'RecurringDeposit',
+ 'CertificateOfDeposit'
+ ) THEN NVL(
+ (
+ from_unixtime(
+ unix_timestamp(
+ cast(scratch_col_issue.date_string as date)
+ ),
+ 'MM-dd-yyyy'
+ )
+ ),
+ ' '
+ ) ELSE '' END AS MAT_DTE
+FROM
+ scratch_col_issue
+WHERE
+ NVL(scratch_col_issue.id, '') IN (
+ SELECT
+ EXPLODE(
+ SPLIT('8800', ',')
+ )
+ );
+
+SELECT
+ CASE WHEN scratch_col_issue.value in (
+ 'TermDeposit', 'RecurringDeposit',
+ 'CertificateOfDeposit'
+ ) THEN NVL(
+ (
+ from_unixtime(
+ unix_timestamp(
+ cast(scratch_col_issue.date_string as date)
+ ),
+ 'MM-dd-yyyy'
+ )
+ ),
+ ' '
+ ) ELSE '' END AS MAT_DTE
+FROM
+ scratch_col_issue
+WHERE
+ NVL(scratch_col_issue.id, '') IN (
+ SELECT
+ EXPLODE(
+ SPLIT('8800', ',')
+ )
+ );
diff --git a/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out
new file mode 100644
index 00000000000..2e0784f5d23
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out
@@ -0,0 +1,332 @@
+PREHOOK: query: CREATE EXTERNAL TABLE scratch_col_issue_txt(
+ `id` int,
+ `value` string,
+ `date_string` string)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ';'
+LINES TERMINATED BY '\n'
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@scratch_col_issue_txt
+POSTHOOK: query: CREATE EXTERNAL TABLE scratch_col_issue_txt(
+ `id` int,
+ `value` string,
+ `date_string` string)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ';'
+LINES TERMINATED BY '\n'
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@scratch_col_issue_txt
+PREHOOK: query: CREATE TABLE scratch_col_issue
+AS SELECT * FROM scratch_col_issue_txt
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@scratch_col_issue_txt
+PREHOOK: Output: database:default
+PREHOOK: Output: default@scratch_col_issue
+POSTHOOK: query: CREATE TABLE scratch_col_issue
+AS SELECT * FROM scratch_col_issue_txt
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@scratch_col_issue_txt
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@scratch_col_issue
+POSTHOOK: Lineage: scratch_col_issue.date_string SIMPLE [(scratch_col_issue_txt)scratch_col_issue_txt.FieldSchema(name:date_string, type:string, comment:null), ]
+POSTHOOK: Lineage: scratch_col_issue.id SIMPLE [(scratch_col_issue_txt)scratch_col_issue_txt.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: scratch_col_issue.value SIMPLE [(scratch_col_issue_txt)scratch_col_issue_txt.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: DESCRIBE FORMATTED scratch_col_issue
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@scratch_col_issue
+POSTHOOK: query: DESCRIBE FORMATTED scratch_col_issue
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@scratch_col_issue
+# col_name data_type comment
+id int
+value string
+date_string string
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"date_string\":\"true\",\"id\":\"true\",\"value\":\"true\"}}
+ bucketing_version 2
+ numFiles 1
+ numRows 2
+ rawDataSize 44
+ totalSize 46
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat: org.apache.hadoop.mapred.TextInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: EXPLAIN VECTORIZATION DETAIL SELECT
+ CASE WHEN scratch_col_issue.value in (
+ 'TermDeposit', 'RecurringDeposit',
+ 'CertificateOfDeposit'
+ ) THEN NVL(
+ (
+ from_unixtime(
+ unix_timestamp(
+ cast(scratch_col_issue.date_string as date)
+ ),
+ 'MM-dd-yyyy'
+ )
+ ),
+ ' '
+ ) ELSE '' END AS MAT_DTE
+FROM
+ scratch_col_issue
+WHERE
+ NVL(scratch_col_issue.id, '') IN (
+ SELECT
+ EXPLODE(
+ SPLIT('8800', ',')
+ )
+ )
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL SELECT
+ CASE WHEN scratch_col_issue.value in (
+ 'TermDeposit', 'RecurringDeposit',
+ 'CertificateOfDeposit'
+ ) THEN NVL(
+ (
+ from_unixtime(
+ unix_timestamp(
+ cast(scratch_col_issue.date_string as date)
+ ),
+ 'MM-dd-yyyy'
+ )
+ ),
+ ' '
+ ) ELSE '' END AS MAT_DTE
+FROM
+ scratch_col_issue
+WHERE
+ NVL(scratch_col_issue.id, '') IN (
+ SELECT
+ EXPLODE(
+ SPLIT('8800', ',')
+ )
+ )
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Map 2 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: scratch_col_issue
+ filterExpr: COALESCE(id,'') is not null (type: boolean)
+ Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE
+ TableScan Vectorization:
+ native: true
+ vectorizationSchemaColumns: [0:id:int, 1:value:string, 2:date_string:string, 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 4:ROW__IS__DELETED:boolean]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+ predicateExpression: SelectColumnIsNotNull(col 7:string)(children: VectorCoalesce(columns [5, 6])(children: CastLongToString(col 0:int) -> 5:string, ConstantVectorExpression(val ) -> 6:string) -> 7:string)
+ predicate: COALESCE(id,'') is not null (type: boolean)
+ Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: id (type: int), value (type: string), date_string (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [0, 1, 2]
+ Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 COALESCE(_col0,'') (type: string)
+ 1 _col0 (type: string)
+ Map Join Vectorization:
+ bigTableKeyColumns: 8:string
+ bigTableKeyExpressions: VectorCoalesce(columns [5, 7])(children: CastLongToString(col 0:int) -> 5:string, ConstantVectorExpression(val ) -> 7:string) -> 8:string
+ bigTableRetainColumnNums: [1, 2]
+ bigTableValueColumns: 1:string, 2:string
+ className: VectorMapJoinLeftSemiStringOperator
+ native: true
+ nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+ nonOuterSmallTableKeyMapping: []
+ projectedOutput: 1:string, 2:string
+ hashTableImplementationType: OPTIMIZED
+ outputColumnNames: _col1, _col2
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: if((_col1) IN ('TermDeposit', 'RecurringDeposit', 'CertificateOfDeposit'), COALESCE(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'),' '), '') (type: string)
+ outputColumnNames: _col0
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [14]
+ selectExpressions: IfExprCondExprColumn(col 9:boolean, col 13:string, col 5:string)(children: StringColumnInList(col 1, values TermDeposit, RecurringDeposit, CertificateOfDeposit) -> 9:boolean, VectorCoalesce(columns [5, 12])(children: VectorUDFAdaptor(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'))(children: VectorUDFUnixTimeStampDate(col 10)(children: CastStringToDate(col 2:string) -> 10:date) -> 11:bigint) -> 5:string, ConstantVector [...]
+ Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+ Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: true
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 3
+ includeColumns: [0, 1, 2]
+ dataColumns: id:int, value:string, date_string:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [string, string, string, string, bigint, bigint, bigint, string, string, string]
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: _dummy_table
+ Row Limit Per Split: 1
+ Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: split('8800', ',') (type: array<string>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE
+ UDTF Operator
+ Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE
+ function name: explode
+ Filter Operator
+ predicate: col is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ keys: _col0 (type: string)
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: llap
+ LLAP IO: no inputs
+ Map Vectorization:
+ enabled: false
+ enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+ enabledConditionsNotMet: Could not enable vectorization due to partition column names size 1 is greater than the number of table column names size 0 IS false
+ inputFileFormats: org.apache.hadoop.hive.ql.io.NullRowsInputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT
+ CASE WHEN scratch_col_issue.value in (
+ 'TermDeposit', 'RecurringDeposit',
+ 'CertificateOfDeposit'
+ ) THEN NVL(
+ (
+ from_unixtime(
+ unix_timestamp(
+ cast(scratch_col_issue.date_string as date)
+ ),
+ 'MM-dd-yyyy'
+ )
+ ),
+ ' '
+ ) ELSE '' END AS MAT_DTE
+FROM
+ scratch_col_issue
+WHERE
+ NVL(scratch_col_issue.id, '') IN (
+ SELECT
+ EXPLODE(
+ SPLIT('8800', ',')
+ )
+ )
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+ CASE WHEN scratch_col_issue.value in (
+ 'TermDeposit', 'RecurringDeposit',
+ 'CertificateOfDeposit'
+ ) THEN NVL(
+ (
+ from_unixtime(
+ unix_timestamp(
+ cast(scratch_col_issue.date_string as date)
+ ),
+ 'MM-dd-yyyy'
+ )
+ ),
+ ' '
+ ) ELSE '' END AS MAT_DTE
+FROM
+ scratch_col_issue
+WHERE
+ NVL(scratch_col_issue.id, '') IN (
+ SELECT
+ EXPLODE(
+ SPLIT('8800', ',')
+ )
+ )
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@scratch_col_issue
+#### A masked pattern was here ####
+
+