Posted to commits@hive.apache.org by kr...@apache.org on 2022/02/09 16:51:19 UTC

[hive] branch master updated: HIVE-25918: Invalid stats after multi inserting into the same partition (Krisztian Kasa, reviewed by Stamatis Zampetakis)

This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e7c051  HIVE-25918: Invalid stats after multi inserting into the same partition (Krisztian Kasa, reviewed by Stamatis Zampetakis)
0e7c051 is described below

commit 0e7c051c01f7290033cd9a630cf5c94281d244f6
Author: Krisztian Kasa <ka...@gmail.com>
AuthorDate: Wed Feb 9 17:50:33 2022 +0100

    HIVE-25918: Invalid stats after multi inserting into the same partition (Krisztian Kasa, reviewed by Stamatis Zampetakis)
---
 .../hadoop/hive/ql/stats/BasicStatsTask.java       |  24 ++---
 .../clientpositive/stats_part_multi_insert.q       |  19 ++++
 .../llap/stats_part_multi_insert.q.out             | 116 +++++++++++++++++++++
 3 files changed, 145 insertions(+), 14 deletions(-)
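
The failure mode behind HIVE-25918 is a lost update on the partition's basic stats: a multi-insert statement spawns one stats task per insert branch, and before this patch each task published counters computed against the Partition snapshot it took at plan time, so the last task to publish silently erased the other branch's contribution. Below is a minimal, self-contained sketch of that race and of the reload-before-publish approach the patch takes; all names here are invented for illustration and are not Hive's API.

    import java.util.HashMap;
    import java.util.Map;

    public class StaleStatsSketch {
        // Toy "metastore": partition name -> numRows. Purely illustrative.
        static final Map<String, Long> metastore = new HashMap<>();

        // Publishes stats computed against a snapshot taken earlier (pre-patch behavior).
        static void publishFromSnapshot(long snapshotNumRows, long rowsWritten) {
            metastore.put("p=101", snapshotNumRows + rowsWritten);
        }

        // Reloads the current value before publishing (the patch's approach).
        static void publishAfterReload(long rowsWritten) {
            long current = metastore.getOrDefault("p=101", 0L);
            metastore.put("p=101", current + rowsWritten);
        }

        public static void main(String[] args) {
            // Both insert branches snapshot the partition before either one writes.
            metastore.put("p=101", 0L);
            long snapshot = metastore.get("p=101");

            publishFromSnapshot(snapshot, 1); // branch 1 publishes numRows = 1
            publishFromSnapshot(snapshot, 1); // branch 2 publishes numRows = 1, branch 1 is lost
            System.out.println("stale snapshots: " + metastore.get("p=101")); // prints 1, not 2

            // Reloading before each publish counts both branches.
            metastore.put("p=101", 0L);
            publishAfterReload(1);
            publishAfterReload(1);
            System.out.println("reload first:    " + metastore.get("p=101")); // prints 2
        }
    }
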

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java
index 5b005ae..c04d5c8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java
@@ -29,6 +29,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.stream.Collectors;
 
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
@@ -46,7 +47,6 @@ import org.apache.hadoop.hive.ql.ErrorMsg;
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.io.AcidUtils;
-import org.apache.hadoop.hive.ql.lockmgr.HiveTxnManager;
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Partition;
@@ -64,6 +64,9 @@ import org.slf4j.LoggerFactory;
 import com.google.common.collect.Lists;
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 
+import static java.util.Collections.emptyList;
+import static java.util.Collections.singletonList;
+import static java.util.Collections.unmodifiableList;
 import static org.apache.hadoop.hive.common.StatsSetupConst.DELETE_COUNT;
 import static org.apache.hadoop.hive.common.StatsSetupConst.INSERT_COUNT;
 import static org.apache.hadoop.hive.common.StatsSetupConst.UPDATE_COUNT;
@@ -481,8 +484,6 @@ public class BasicStatsTask implements Serializable, IStatsProcessor {
       return null; //we are in CTAS, so we know there are no partitions
     }
 
-    List<Partition> list = new ArrayList<Partition>();
-
     if (work.getTableSpecs() != null) {
 
       // ANALYZE command
@@ -492,12 +493,7 @@ public class BasicStatsTask implements Serializable, IStatsProcessor {
         return null;
       }
       // get all partitions that matches with the partition spec
-      List<Partition> partitions = tblSpec.partitions;
-      if (partitions != null) {
-        for (Partition partn : partitions) {
-          list.add(partn);
-        }
-      }
+      return tblSpec.partitions != null ? unmodifiableList(tblSpec.partitions) : emptyList();
     } else if (work.getLoadTableDesc() != null) {
 
       // INSERT OVERWRITE command
@@ -510,15 +506,15 @@ public class BasicStatsTask implements Serializable, IStatsProcessor {
       if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
         // If no dynamic partitions are generated, dpPartSpecs may not be initialized
         if (dpPartSpecs != null) {
-          // load the list of DP partitions and return the list of partition specs
-          list.addAll(dpPartSpecs);
+          // Reload partition metadata because another BasicStatsTask instance may have updated the stats.
+          List<String> partNames = dpPartSpecs.stream().map(Partition::getName).collect(Collectors.toList());
+          return db.getPartitionsByNames(table, partNames);
         }
       } else { // static partition
-        Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
-        list.add(partn);
+        return singletonList(db.getPartition(table, tbd.getPartitionSpec(), false));
       }
     }
-    return list;
+    return emptyList();
   }
 
   public Collection<Partition> getDpPartSpecs() {
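
Two things change in getPartitionsList() above. First, the mutable accumulator list is gone: each branch now returns unmodifiableList, singletonList, or an empty list directly. Second, the dynamic-partition branch no longer trusts the Partition objects cached in dpPartSpecs, which were fetched when the task was set up; in a multi-insert, the second stats task could therefore publish counters derived from metadata that predated the first task's update. Mapping the cached partitions to their names and re-reading them via db.getPartitionsByNames(table, partNames) ensures each task aggregates on top of the latest metastore state.
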
diff --git a/ql/src/test/queries/clientpositive/stats_part_multi_insert.q b/ql/src/test/queries/clientpositive/stats_part_multi_insert.q
new file mode 100644
index 0000000..1ab3e06
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/stats_part_multi_insert.q
@@ -0,0 +1,19 @@
+-- Test multi inserting to the same partition
+
+create table source(p int, key int,value string);
+insert into source(p, key, value) values (101,42,'string42');
+
+create table stats_part(key int,value string) partitioned by (p int);
+
+from source
+insert into stats_part select key, value, p
+insert into stats_part select key, value, p;
+
+select p, key, value from stats_part;
+desc formatted stats_part;
+
+set hive.compute.query.using.stats=true;
+select count(*) from stats_part;
+
+set hive.compute.query.using.stats=false;
+select count(*) from stats_part;
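
The new qfile reproduces the report: a single multi-insert writes the same source row twice into partition p=101, then counts the table twice, first with hive.compute.query.using.stats=true (so count(*) is answered from the numRows statistic alone) and then with it disabled (so count(*) scans the data). Before this patch the two results could disagree; the expected output below shows both returning 2.
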
diff --git a/ql/src/test/results/clientpositive/llap/stats_part_multi_insert.q.out b/ql/src/test/results/clientpositive/llap/stats_part_multi_insert.q.out
new file mode 100644
index 0000000..4506881
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/stats_part_multi_insert.q.out
@@ -0,0 +1,116 @@
+PREHOOK: query: create table source(p int, key int,value string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@source
+POSTHOOK: query: create table source(p int, key int,value string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@source
+PREHOOK: query: insert into source(p, key, value) values (101,42,'string42')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source(p, key, value) values (101,42,'string42')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.key SCRIPT []
+POSTHOOK: Lineage: source.p SCRIPT []
+POSTHOOK: Lineage: source.value SCRIPT []
+PREHOOK: query: create table stats_part(key int,value string) partitioned by (p int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@stats_part
+POSTHOOK: query: create table stats_part(key int,value string) partitioned by (p int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@stats_part
+PREHOOK: query: from source
+insert into stats_part select key, value, p
+insert into stats_part select key, value, p
+PREHOOK: type: QUERY
+PREHOOK: Input: default@source
+PREHOOK: Output: default@stats_part
+POSTHOOK: query: from source
+insert into stats_part select key, value, p
+insert into stats_part select key, value, p
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@source
+POSTHOOK: Output: default@stats_part
+POSTHOOK: Output: default@stats_part@p=101
+POSTHOOK: Lineage: stats_part PARTITION(p=101).key SIMPLE [(source)source.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: stats_part PARTITION(p=101).value SIMPLE [(source)source.FieldSchema(name:value, type:string, comment:null), ]
+POSTHOOK: Lineage: stats_part PARTITION(p=101).key SIMPLE [(source)source.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: stats_part PARTITION(p=101).value SIMPLE [(source)source.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: select p, key, value from stats_part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stats_part
+PREHOOK: Input: default@stats_part@p=101
+#### A masked pattern was here ####
+POSTHOOK: query: select p, key, value from stats_part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stats_part
+POSTHOOK: Input: default@stats_part@p=101
+#### A masked pattern was here ####
+101	42	string42
+101	42	string42
+PREHOOK: query: desc formatted stats_part
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@stats_part
+POSTHOOK: query: desc formatted stats_part
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@stats_part
+# col_name            	data_type           	comment             
+key                 	int                 	                    
+value               	string              	                    
+	 	 
+# Partition Information	 	 
+# col_name            	data_type           	comment             
+p                   	int                 	                    
+	 	 
+# Detailed Table Information	 	 
+Database:           	default             	 
+#### A masked pattern was here ####
+Retention:          	0                   	 
+#### A masked pattern was here ####
+Table Type:         	MANAGED_TABLE       	 
+Table Parameters:	 	 
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
+	bucketing_version   	2                   
+	numFiles            	2                   
+	numPartitions       	1                   
+	numRows             	2                   
+	rawDataSize         	22                  
+	totalSize           	24                  
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+Storage Desc Params:	 	 
+	serialization.format	1                   
+PREHOOK: query: select count(*) from stats_part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stats_part
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from stats_part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stats_part
+#### A masked pattern was here ####
+2
+PREHOOK: query: select count(*) from stats_part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stats_part
+PREHOOK: Input: default@stats_part@p=101
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from stats_part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stats_part
+POSTHOOK: Input: default@stats_part@p=101
+#### A masked pattern was here ####
+2
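
Note the PREHOOK/POSTHOOK inputs of the two counts above: the stats-backed query lists only default@stats_part (no partition is read, so its 2 comes straight from the stored numRows), while the full scan also lists default@stats_part@p=101. Together with numFiles 2 and numRows 2 in the Table Parameters, this confirms that both insert branches are now reflected in the basic stats.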