Posted to commits@hive.apache.org by kr...@apache.org on 2023/04/17 06:10:12 UTC

[hive] branch master updated: HIVE-27069: Incorrect results with bucket map join (Dayakar M, reviewed by Krisztian Kasa)

This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new f6732333fa9 HIVE-27069: Incorrect results with bucket map join (Dayakar M, reviewed by Krisztian Kasa)
f6732333fa9 is described below

commit f6732333fa9dae07b7b09eaf76241ea69f2492db
Author: Dayakar M <59...@users.noreply.github.com>
AuthorDate: Mon Apr 17 11:39:59 2023 +0530

    HIVE-27069: Incorrect results with bucket map join (Dayakar M, reviewed by Krisztian Kasa)
---
 .../test/resources/testconfiguration.properties    |   1 +
 .../hive/ql/optimizer/ReduceSinkMapJoinProc.java   |   3 +-
 .../clientpositive/bucketmapjoin_with_subquery.q   |  52 +++
 .../tez/bucketmapjoin_with_subquery.q.out          | 355 +++++++++++++++++++++
 4 files changed, 410 insertions(+), 1 deletion(-)
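
In short, as read from the one-line fix below: ReduceSinkMapJoinProc previously pinned the parent ReduceSink's reducer traits to FIXED only when the bucket map join's edge was a CUSTOM_EDGE. The new test builds a bucket map join fed over a CUSTOM_SIMPLE_EDGE (visible as "Reducer 3 <- Map 1 (CUSTOM_SIMPLE_EDGE)" in the plan below), where Tez auto reducer parallelism could still change the partition count out from under the bucket layout and yield incorrect results; the fix extends the guard to CUSTOM_SIMPLE_EDGE, excluding dynamic partition hash joins. A standalone sketch of the resulting condition follows the Java hunk.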

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 46c824bccac..b85c50bc098 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -17,6 +17,7 @@ minitez.query.files.shared=\
 # specific to tez and cannot be added to minillap.
 minitez.query.files=\
   acid_vectorization_original_tez.q,\
+  bucketmapjoin_with_subquery.q,\
   delete_orig_table.q,\
   explainanalyze_1.q,\
   explainanalyze_3.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
index bd6c41819d5..ea192e8af9a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
@@ -277,7 +277,8 @@ public class ReduceSinkMapJoinProc implements SemanticNodeProcessor {
         edgeType = EdgeType.CUSTOM_SIMPLE_EDGE;
       }
     }
-    if (edgeType == EdgeType.CUSTOM_EDGE) {
+    if (edgeType == EdgeType.CUSTOM_EDGE || (edgeType == EdgeType.CUSTOM_SIMPLE_EDGE && !mapJoinOp.getConf()
+        .isDynamicPartitionHashJoin())) {
       // disable auto parallelism for bucket map joins
       parentRS.getConf().setReducerTraits(EnumSet.of(FIXED));
     }
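
For context, here is a minimal, self-contained sketch of the guard as it stands after this patch. The type and method names are simplified stand-ins for illustration only; the real logic uses Hive's EdgeType and mapJoinOp.getConf().isDynamicPartitionHashJoin(), exactly as in the hunk above:

    // Simplified stand-ins for Hive's EdgeType and MapJoin descriptor;
    // these names are illustrative assumptions, not the real Hive API.
    enum EdgeType { SIMPLE_EDGE, CUSTOM_EDGE, CUSTOM_SIMPLE_EDGE }

    final class ReducerTraitsSketch {
      // Bucket map joins need the parent ReduceSink to produce exactly as
      // many partitions as the bucketed table has buckets; if Tez auto-tunes
      // reducer parallelism, that alignment breaks and the join can return
      // wrong results. Dynamic partition hash joins are exempt: they
      // repartition rows by hash at runtime and do not depend on a fixed
      // bucket count.
      static boolean mustPinReducerParallelism(EdgeType edgeType,
                                               boolean dynamicPartitionHashJoin) {
        return edgeType == EdgeType.CUSTOM_EDGE
            || (edgeType == EdgeType.CUSTOM_SIMPLE_EDGE && !dynamicPartitionHashJoin);
      }

      public static void main(String[] args) {
        // The repro below is a bucket map join (not a dynamic partition hash
        // join) fed over a CUSTOM_SIMPLE_EDGE, so the guard now fires:
        System.out.println(mustPinReducerParallelism(EdgeType.CUSTOM_SIMPLE_EDGE, false)); // true
        // A genuine dynamic partition hash join keeps auto-parallelism:
        System.out.println(mustPinReducerParallelism(EdgeType.CUSTOM_SIMPLE_EDGE, true));  // false
        // The pre-patch behavior covered only CUSTOM_EDGE:
        System.out.println(mustPinReducerParallelism(EdgeType.CUSTOM_EDGE, false));        // true
      }
    }

Before the patch only the first disjunct existed, so a bucket map join arriving over a CUSTOM_SIMPLE_EDGE, as in the plan below, kept auto reducer parallelism enabled and could produce the incorrect results described in HIVE-27069.
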
diff --git a/ql/src/test/queries/clientpositive/bucketmapjoin_with_subquery.q b/ql/src/test/queries/clientpositive/bucketmapjoin_with_subquery.q
new file mode 100644
index 00000000000..dd7274014c8
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/bucketmapjoin_with_subquery.q
@@ -0,0 +1,52 @@
+-- HIVE-27069
+set hive.query.results.cache.enabled=false;
+set hive.compute.query.using.stats=false;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.auto.convert.join=true;
+set hive.auto.convert.sortmerge.join=true;
+set hive.auto.convert.sortmerge.join.to.mapjoin=true;
+set hive.convert.join.bucket.mapjoin.tez=true;
+set hive.fetch.task.conversion=none;
+set hive.merge.nway.joins=false;
+set hive.optimize.dynamic.partition.hashjoin=true;
+set hive.optimize.index.filter=true;
+set hive.optimize.remove.sq_count_check=true;
+set hive.prewarm.enabled=false;
+set hive.join.inner.residual=false;
+set hive.limit.optimize.enable=true;
+set hive.mapjoin.bucket.cache.size=10000;
+set hive.strict.managed.tables=true;
+set hive.tez.auto.reducer.parallelism=true;
+set hive.tez.bucket.pruning=true;
+
+CREATE TABLE DUP_TEST (id int , in_date timestamp , sample varchar(100)) stored as orc tblproperties('transactional'='true', 'transactional_properties'='default');
+
+CREATE TABLE DUP_TEST_TARGET (id int , in_date timestamp , sample varchar(100)) CLUSTERED by (ID) INTO 5 BUCKETS STORED AS ORC tblproperties('transactional'='true', 'transactional_properties'='default');
+
+INSERT INTO DUP_TEST
+(id , in_date , sample)
+values
+(1  , '2023-04-14 10:11:12.111' , 'test1'),
+(2  , '2023-04-14 10:11:12.111' , 'test2'),
+(3  , '2023-04-14 10:11:12.111' , 'test3'),
+(4  , '2023-04-14 10:11:12.111' , 'test4'),
+(5  , '2023-04-14 10:11:12.111' , 'test5'),
+(6  , '2023-04-14 10:11:12.111' , 'test6'),
+(7  , '2023-04-14 10:11:12.111' , 'test7'),
+(8  , '2023-04-14 10:11:12.111' , 'test8'),
+(9  , '2023-04-14 10:11:12.111' , 'test9');
+
+-- Run merge into the target table for the first time
+MERGE INTO DUP_TEST_TARGET T USING (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+    WHEN MATCHED THEN UPDATE SET  in_date = S.in_date , sample = S.sample
+    WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.in_date , S.sample);
+
+explain vectorization detail select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id;
+
+select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id;
+
+select * from DUP_TEST_TARGET T join DUP_TEST S ON T.id = S.id;
\ No newline at end of file
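
A note on the test above: disabling the query results cache, stats-based query answering, and fetch-task conversion forces the verification SELECTs to run through the actual Tez plan rather than being short-circuited. After the fix, each of the two final SELECTs returns all nine ids exactly once, as the expected output below confirms.
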
diff --git a/ql/src/test/results/clientpositive/tez/bucketmapjoin_with_subquery.q.out b/ql/src/test/results/clientpositive/tez/bucketmapjoin_with_subquery.q.out
new file mode 100644
index 00000000000..7ce5bb945e8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/bucketmapjoin_with_subquery.q.out
@@ -0,0 +1,355 @@
+PREHOOK: query: CREATE TABLE DUP_TEST (id int , in_date timestamp , sample varchar(100)) stored as orc tblproperties('transactional'='true', 'transactional_properties'='default')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@DUP_TEST
+POSTHOOK: query: CREATE TABLE DUP_TEST (id int , in_date timestamp , sample varchar(100)) stored as orc tblproperties('transactional'='true', 'transactional_properties'='default')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@DUP_TEST
+PREHOOK: query: CREATE TABLE DUP_TEST_TARGET (id int , in_date timestamp , sample varchar(100)) CLUSTERED by (ID) INTO 5 BUCKETS STORED AS ORC tblproperties('transactional'='true', 'transactional_properties'='default')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@DUP_TEST_TARGET
+POSTHOOK: query: CREATE TABLE DUP_TEST_TARGET (id int , in_date timestamp , sample varchar(100)) CLUSTERED by (ID) INTO 5 BUCKETS STORED AS ORC tblproperties('transactional'='true', 'transactional_properties'='default')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@DUP_TEST_TARGET
+PREHOOK: query: INSERT INTO DUP_TEST
+(id , in_date , sample)
+values
+(1  , '2023-04-14 10:11:12.111' , 'test1'),
+(2  , '2023-04-14 10:11:12.111' , 'test2'),
+(3  , '2023-04-14 10:11:12.111' , 'test3'),
+(4  , '2023-04-14 10:11:12.111' , 'test4'),
+(5  , '2023-04-14 10:11:12.111' , 'test5'),
+(6  , '2023-04-14 10:11:12.111' , 'test6'),
+(7  , '2023-04-14 10:11:12.111' , 'test7'),
+(8  , '2023-04-14 10:11:12.111' , 'test8'),
+(9  , '2023-04-14 10:11:12.111' , 'test9')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@dup_test
+POSTHOOK: query: INSERT INTO DUP_TEST
+(id , in_date , sample)
+values
+(1  , '2023-04-14 10:11:12.111' , 'test1'),
+(2  , '2023-04-14 10:11:12.111' , 'test2'),
+(3  , '2023-04-14 10:11:12.111' , 'test3'),
+(4  , '2023-04-14 10:11:12.111' , 'test4'),
+(5  , '2023-04-14 10:11:12.111' , 'test5'),
+(6  , '2023-04-14 10:11:12.111' , 'test6'),
+(7  , '2023-04-14 10:11:12.111' , 'test7'),
+(8  , '2023-04-14 10:11:12.111' , 'test8'),
+(9  , '2023-04-14 10:11:12.111' , 'test9')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@dup_test
+POSTHOOK: Lineage: dup_test.id SCRIPT []
+POSTHOOK: Lineage: dup_test.in_date SCRIPT []
+POSTHOOK: Lineage: dup_test.sample SCRIPT []
+PREHOOK: query: MERGE INTO DUP_TEST_TARGET T USING (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+    WHEN MATCHED THEN UPDATE SET  in_date = S.in_date , sample = S.sample
+    WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.in_date , S.sample)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: default@dup_test_target
+PREHOOK: Output: default@dup_test_target
+PREHOOK: Output: default@merge_tmp_table
+POSTHOOK: query: MERGE INTO DUP_TEST_TARGET T USING (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+    WHEN MATCHED THEN UPDATE SET  in_date = S.in_date , sample = S.sample
+    WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.in_date , S.sample)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: default@dup_test_target
+POSTHOOK: Output: default@dup_test_target
+POSTHOOK: Output: default@merge_tmp_table
+POSTHOOK: Lineage: dup_test_target.id SIMPLE [(dup_test)dup_test.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.id SIMPLE [(dup_test)dup_test.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.in_date SIMPLE [(dup_test)dup_test.FieldSchema(name:in_date, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.in_date SIMPLE [(dup_test)dup_test.FieldSchema(name:in_date, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.sample SIMPLE [(dup_test)dup_test.FieldSchema(name:sample, type:varchar(100), comment:null), ]
+POSTHOOK: Lineage: dup_test_target.sample SIMPLE [(dup_test)dup_test.FieldSchema(name:sample, type:varchar(100), comment:null), ]
+POSTHOOK: Lineage: merge_tmp_table.val EXPRESSION [(dup_test_target)dup_test_target.FieldSchema(name:ROW__ID, type:struct<writeId:bigint,bucketId:int,rowId:bigint>, comment:), ]
+PREHOOK: query: explain vectorization detail select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain vectorization detail select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 3 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:id:int, 1:in_date:timestamp, 2:sample:varchar(100), 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 4:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 0:int)
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: id (type: int), in_date (type: timestamp), sample (type: varchar(100))
+                      outputColumnNames: _col0, _col1, _col2
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0, 1, 2]
+                      Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Reduce Sink Vectorization:
+                            className: VectorReduceSinkObjectHashOperator
+                            keyColumns: 0:int
+                            native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                            partitionColumns: 0:int
+                            valueColumns: 1:timestamp, 2:varchar(100)
+                        Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: timestamp), _col2 (type: varchar(100))
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: true
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    includeColumns: [0, 1, 2]
+                    dataColumns: id:int, in_date:timestamp, sample:varchar(100)
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: []
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: dup_test
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:id:int, 1:in_date:timestamp, 2:sample:varchar(100), 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 4:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 0:int)
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: id (type: int), in_date (type: timestamp)
+                      null sort order: aa
+                      sort order: +-
+                      Map-reduce partition columns: id (type: int)
+                      Reduce Sink Vectorization:
+                          className: VectorReduceSinkObjectHashOperator
+                          keyColumns: 0:int, 1:timestamp
+                          native: true
+                          nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                          partitionColumns: 0:int
+                          valueColumns: 2:varchar(100)
+                      Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: sample (type: varchar(100))
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: true
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    includeColumns: [0, 1, 2]
+                    dataColumns: id:int, in_date:timestamp, sample:varchar(100)
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: []
+        Reducer 3 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez] IS true
+                reduceColumnNullOrder: aa
+                reduceColumnSortOrder: +-
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    dataColumns: KEY.reducesinkkey0:int, KEY.reducesinkkey1:timestamp, VALUE._col0:varchar(100)
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [bigint, timestamp, string]
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: timestamp), VALUE._col0 (type: varchar(100))
+                outputColumnNames: _col0, _col1, _col2
+                Select Vectorization:
+                    className: VectorSelectOperator
+                    native: true
+                    projectedOutputColumnNums: [0, 1, 2]
+                Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                PTF Operator
+                  Function definitions:
+                      Input definition
+                        input alias: ptf_0
+                        output shape: _col0: int, _col1: timestamp, _col2: varchar(100)
+                        type: WINDOWING
+                      Windowing table definition
+                        input alias: ptf_1
+                        name: windowingtablefunction
+                        order by: _col1 DESC NULLS FIRST
+                        partition by: _col0
+                        raw input shape:
+                        window functions:
+                            window function definition
+                              alias: ROW_NUMBER_window_0
+                              name: ROW_NUMBER
+                              window function: GenericUDAFRowNumberEvaluator
+                              window frame: ROWS PRECEDING(MAX)~FOLLOWING(MAX)
+                              isPivotResult: true
+                  PTF Vectorization:
+                      allEvaluatorsAreStreaming: true
+                      className: VectorPTFOperator
+                      evaluatorClasses: [VectorPTFEvaluatorRowNumber]
+                      functionInputExpressions: [null]
+                      functionNames: [ROW_NUMBER]
+                      keyInputColumns: [0, 1]
+                      native: true
+                      nonKeyInputColumns: [2]
+                      orderExpressions: [col 1:timestamp]
+                      outputColumns: [3, 0, 1, 2]
+                      outputTypes: [int, int, timestamp, varchar(100)]
+                      partitionExpressions: [col 0:int]
+                      streamingColumns: [3]
+                  Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 3:int, val 1)
+                    predicate: (ROW_NUMBER_window_0 = 1) (type: boolean)
+                    Statistics: Num rows: 4 Data size: 532 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: _col0 (type: int), _col1 (type: timestamp), _col2 (type: varchar(100))
+                      outputColumnNames: _col0, _col1, _col2
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0, 1, 2]
+                      Statistics: Num rows: 4 Data size: 532 Basic stats: COMPLETE Column stats: COMPLETE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        keys:
+                          0 _col0 (type: int)
+                          1 _col0 (type: int)
+                        Map Join Vectorization:
+                            bigTableKeyColumns: 0:int
+                            bigTableRetainColumnNums: [0, 1, 2]
+                            bigTableValueColumns: 0:int, 1:timestamp, 2:varchar(100)
+                            className: VectorMapJoinInnerLongOperator
+                            native: true
+                            nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                            nonOuterSmallTableKeyMapping: []
+                            projectedOutput: 0:int, 4:timestamp, 5:varchar(100), 0:int, 1:timestamp, 2:varchar(100)
+                            smallTableValueMapping: 4:timestamp, 5:varchar(100)
+                            hashTableImplementationType: OPTIMIZED
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+                        input vertices:
+                          0 Map 1
+                        Statistics: Num rows: 9 Data size: 2257 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          File Sink Vectorization:
+                              className: VectorFileSinkOperator
+                              native: false
+                          Statistics: Num rows: 9 Data size: 2257 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+2	2023-04-14 10:11:12.111	test2	2	2023-04-14 10:11:12.111	test2
+3	2023-04-14 10:11:12.111	test3	3	2023-04-14 10:11:12.111	test3
+6	2023-04-14 10:11:12.111	test6	6	2023-04-14 10:11:12.111	test6
+7	2023-04-14 10:11:12.111	test7	7	2023-04-14 10:11:12.111	test7
+1	2023-04-14 10:11:12.111	test1	1	2023-04-14 10:11:12.111	test1
+4	2023-04-14 10:11:12.111	test4	4	2023-04-14 10:11:12.111	test4
+5	2023-04-14 10:11:12.111	test5	5	2023-04-14 10:11:12.111	test5
+8	2023-04-14 10:11:12.111	test8	8	2023-04-14 10:11:12.111	test8
+9	2023-04-14 10:11:12.111	test9	9	2023-04-14 10:11:12.111	test9
+PREHOOK: query: select * from DUP_TEST_TARGET T join DUP_TEST S ON T.id = S.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from DUP_TEST_TARGET T join DUP_TEST S ON T.id = S.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	2023-04-14 10:11:12.111	test1	1	2023-04-14 10:11:12.111	test1
+8	2023-04-14 10:11:12.111	test8	8	2023-04-14 10:11:12.111	test8
+4	2023-04-14 10:11:12.111	test4	4	2023-04-14 10:11:12.111	test4
+5	2023-04-14 10:11:12.111	test5	5	2023-04-14 10:11:12.111	test5
+3	2023-04-14 10:11:12.111	test3	3	2023-04-14 10:11:12.111	test3
+7	2023-04-14 10:11:12.111	test7	7	2023-04-14 10:11:12.111	test7
+9	2023-04-14 10:11:12.111	test9	9	2023-04-14 10:11:12.111	test9
+2	2023-04-14 10:11:12.111	test2	2	2023-04-14 10:11:12.111	test2
+6	2023-04-14 10:11:12.111	test6	6	2023-04-14 10:11:12.111	test6