Posted to commits@hive.apache.org by se...@apache.org on 2018/07/23 22:56:39 UTC

hive git commit: HIVE-19891 : inserting into external tables with custom partition directories may cause data loss (Sergey Shelukhin, reviewed by Ashutosh Chauhan)

Repository: hive
Updated Branches:
  refs/heads/master 5e7aa0986 -> 4e9562f1e


HIVE-19891 : inserting into external tables with custom partition directories may cause data loss (Sergey Shelukhin, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4e9562f1
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4e9562f1
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4e9562f1

Branch: refs/heads/master
Commit: 4e9562f1e7a0baeae6b5e0ead8f54a43f3196f5b
Parents: 5e7aa09
Author: sergey <se...@apache.org>
Authored: Mon Jul 23 15:55:55 2018 -0700
Committer: sergey <se...@apache.org>
Committed: Mon Jul 23 15:56:25 2018 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/exec/MoveTask.java    |   2 +-
 .../apache/hadoop/hive/ql/metadata/Hive.java    |  22 ++-
 .../hive/ql/parse/DDLSemanticAnalyzer.java      |   1 +
 .../hadoop/hive/ql/plan/LoadTableDesc.java      |  10 +-
 .../queries/clientpositive/external_insert.q    |  14 ++
 .../clientpositive/external_insert.q.out        | 158 +++++++++++++++++++
 6 files changed, 198 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
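In outline, the scenario this patch addresses, and which the new external_insert.q test below exercises, looks roughly like the following sketch (the file:///tmp path is a placeholder for any pre-populated directory):

  CREATE EXTERNAL TABLE tbl2 (index int, value int) PARTITIONED BY (created_date string);
  ALTER TABLE tbl2 ADD PARTITION (created_date='2018-02-01');
  -- point the partition at a custom directory that already contains data
  ALTER TABLE tbl2 PARTITION (created_date='2018-02-01')
    SET LOCATION 'file:///tmp/external_insert';
  -- before this patch, loadPartition re-derived the partition path from the
  -- table location whenever inheritTableSpecs was true (its default), so this
  -- insert repointed the partition under the warehouse directory and the rows
  -- already sitting in the custom directory were effectively lost
  INSERT INTO tbl2 PARTITION (created_date='2018-02-01') VALUES (1, 1);

With the fix, an existing partition location is preserved unless the new inheritLocation flag is explicitly set, so the insert lands in the custom directory alongside the old data.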


http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
index bf7749d..322207d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
@@ -482,7 +482,7 @@ public class MoveTask extends Task<MoveWork> implements Serializable {
 
     db.loadPartition(tbd.getSourcePath(), db.getTable(tbd.getTable().getTableName()),
         tbd.getPartitionSpec(), tbd.getLoadFileType(), tbd.getInheritTableSpecs(),
-        isSkewedStoredAsDirs(tbd), work.isSrcLocal(),
+        tbd.getInheritLocation(), isSkewedStoredAsDirs(tbd), work.isSrcLocal(),
          work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID &&
             !tbd.isMmTable(),
          hasFollowingStatsTask(),

http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 1fe1fb6..fdb4fa2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -1717,7 +1717,8 @@ public class Hive {
    * @return Partition object being loaded with data
    */
   public Partition loadPartition(Path loadPath, Table tbl, Map<String, String> partSpec,
-      LoadFileType loadFileType, boolean inheritTableSpecs, boolean isSkewedStoreAsSubdir,
+      LoadFileType loadFileType, boolean inheritTableSpecs, boolean inheritLocation,
+      boolean isSkewedStoreAsSubdir,
       boolean isSrcLocal, boolean isAcidIUDoperation, boolean hasFollowingStatsTask, Long writeId,
       int stmtId, boolean isInsertOverwrite) throws HiveException {
     Path tblDataLocationPath =  tbl.getDataLocation();
@@ -1741,10 +1742,8 @@ public class Hive {
       Path oldPartPath = (oldPart != null) ? oldPart.getDataLocation() : null;
       Path newPartPath = null;
 
-      if (inheritTableSpecs) {
-        Path partPath = new Path(tbl.getDataLocation(), Warehouse.makePartPath(partSpec));
-        newPartPath = new Path(tblDataLocationPath.toUri().getScheme(),
-            tblDataLocationPath.toUri().getAuthority(), partPath.toUri().getPath());
+      if (inheritLocation) {
+        newPartPath = genPartPathFromTable(tbl, partSpec, tblDataLocationPath);
 
         if(oldPart != null) {
           /*
@@ -1761,7 +1760,8 @@ public class Hive {
           }
         }
       } else {
-        newPartPath = oldPartPath;
+        newPartPath = oldPartPath == null
+          ? genPartPathFromTable(tbl, partSpec, tblDataLocationPath) : oldPartPath;
       }
       List<Path> newFiles = Collections.synchronizedList(new ArrayList<Path>());
 
@@ -1940,6 +1940,14 @@ public class Hive {
     }
   }
 
+
+  private static Path genPartPathFromTable(Table tbl, Map<String, String> partSpec,
+      Path tblDataLocationPath) throws MetaException {
+    Path partPath = new Path(tbl.getDataLocation(), Warehouse.makePartPath(partSpec));
+    return new Path(tblDataLocationPath.toUri().getScheme(),
+        tblDataLocationPath.toUri().getAuthority(), partPath.toUri().getPath());
+  }
+
   /**
    * Load Data commands for fullAcid tables write to base_x (if there is overwrite clause) or
    * delta_x_x directory - same as any other Acid write.  This method modifies the destPath to add
@@ -2262,7 +2270,7 @@ private void constructOneLBLocationMap(FileStatus fSta,
 
               // load the partition
               Partition newPartition = loadPartition(partPath, tbl, fullPartSpec, loadFileType,
-                  true, numLB > 0, false, isAcid, hasFollowingStatsTask, writeId, stmtId,
+                  true, false, numLB > 0, false, isAcid, hasFollowingStatsTask, writeId, stmtId,
                   isInsertOverwrite);
               partitionsMap.put(fullPartSpec, newPartition);
 
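Two things to note in the Hive.java changes above. First, the path computation that used to live inline under the inheritTableSpecs branch is extracted into genPartPathFromTable, and the else branch now falls back to it only when no partition location exists yet. A concrete (hypothetical) example: for a table whose data location is hdfs://nn:8020/warehouse/tbl2 and a partSpec of {created_date=2018-02-01}, the helper returns hdfs://nn:8020/warehouse/tbl2/created_date=2018-02-01, taking the scheme and authority from the table's data location and the subdirectory from Warehouse.makePartPath. Second, the last hunk's call site (on the dynamic-partition load path) passes inheritTableSpecs=true but inheritLocation=false, so dynamic-partition inserts now keep any custom partition locations.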

http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
index 9373df6..2007e13 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
@@ -2094,6 +2094,7 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer {
       LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc,
           partSpec == null ? new HashMap<>() : partSpec);
       ltd.setLbCtx(lbCtx);
+      ltd.setInheritTableSpecs(true);
       Task<MoveWork> moveTsk =
           TaskFactory.get(new MoveWork(null, null, ltd, null, false));
       mergeTask.addDependentTask(moveTsk);

http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
index f15b3c3..af2ece4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
@@ -38,6 +38,7 @@ public class LoadTableDesc extends LoadDesc implements Serializable {
   private ListBucketingCtx lbCtx;
   private boolean inheritTableSpecs = true; //For partitions, flag controlling whether the current
                                             //table specs are to be used
+  private boolean inheritLocation = false; // If true, the partition location is re-derived from the table location even when the partition already has one.
   private int stmtId;
   private Long currentWriteId;
   private boolean isInsertOverwrite;
@@ -71,6 +72,7 @@ public class LoadTableDesc extends LoadDesc implements Serializable {
     this.dpCtx = o.dpCtx;
     this.lbCtx = o.lbCtx;
     this.inheritTableSpecs = o.inheritTableSpecs;
+    this.inheritLocation = o.inheritLocation;
     this.currentWriteId = o.currentWriteId;
     this.table = o.table;
     this.partitionSpec = o.partitionSpec;
@@ -207,8 +209,14 @@ public class LoadTableDesc extends LoadDesc implements Serializable {
     return inheritTableSpecs;
   }
 
+  public boolean getInheritLocation() {
+    return inheritLocation;
+  }
+
   public void setInheritTableSpecs(boolean inheritTableSpecs) {
-    this.inheritTableSpecs = inheritTableSpecs;
+    // An explicit call to this setter also sets inheritLocation to the same value.
+    // TODO: Who actually needs this? Might just be pointless legacy code.
+    this.inheritTableSpecs = inheritLocation = inheritTableSpecs;
   }
 
   public boolean isInsertOverwrite() {
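Note the coupling in the setter above: a caller that explicitly invokes setInheritTableSpecs(true), such as the DDLSemanticAnalyzer merge path changed in this patch, also opts into inheritLocation and so keeps the old behavior of re-deriving the partition path from the table location. Callers that simply leave inheritTableSpecs at its default of true no longer overwrite a pre-existing custom partition location.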

http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/test/queries/clientpositive/external_insert.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/external_insert.q b/ql/src/test/queries/clientpositive/external_insert.q
new file mode 100644
index 0000000..9a62609
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/external_insert.q
@@ -0,0 +1,14 @@
+drop table tbl1;
+
+-- tbl1 is only used to create a directory with data
+CREATE TABLE tbl1 (index int, value int) LOCATION 'file:${system:test.tmp.dir}/external_insert';
+insert into tbl1 VALUES (2, 2);
+
+CREATE external TABLE tbl2 (index int, value int ) PARTITIONED BY ( created_date string );
+ALTER TABLE tbl2 ADD PARTITION(created_date='2018-02-01');
+ALTER TABLE tbl2 PARTITION(created_date='2018-02-01') SET LOCATION 'file:${system:test.tmp.dir}/external_insert';
+select * from tbl2;
+describe formatted tbl2 partition(created_date='2018-02-01');
+insert into tbl2 partition(created_date='2018-02-01') VALUES (1, 1);
+select * from tbl2;
+describe formatted tbl2 partition(created_date='2018-02-01');
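The expected output below confirms the fix: after the INSERT, the partition still reports its custom location (masked in the golden file), numFiles goes to 2, and the final SELECT returns both the pre-existing row (2, 2) and the newly inserted row (1, 1).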

http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/test/results/clientpositive/external_insert.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/external_insert.q.out b/ql/src/test/results/clientpositive/external_insert.q.out
new file mode 100644
index 0000000..fbec406
--- /dev/null
+++ b/ql/src/test/results/clientpositive/external_insert.q.out
@@ -0,0 +1,158 @@
+PREHOOK: query: drop table tbl1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table tbl1
+POSTHOOK: type: DROPTABLE
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl1
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl1
+PREHOOK: query: insert into tbl1 VALUES (2, 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl1
+POSTHOOK: query: insert into tbl1 VALUES (2, 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl1
+POSTHOOK: Lineage: tbl1.index SCRIPT []
+POSTHOOK: Lineage: tbl1.value SCRIPT []
+PREHOOK: query: CREATE external TABLE tbl2 (index int, value int ) PARTITIONED BY ( created_date string )
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl2
+POSTHOOK: query: CREATE external TABLE tbl2 (index int, value int ) PARTITIONED BY ( created_date string )
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl2
+PREHOOK: query: ALTER TABLE tbl2 ADD PARTITION(created_date='2018-02-01')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@tbl2
+POSTHOOK: query: ALTER TABLE tbl2 ADD PARTITION(created_date='2018-02-01')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@tbl2
+POSTHOOK: Output: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+PREHOOK: type: ALTERPARTITION_LOCATION
+PREHOOK: Input: default@tbl2
+PREHOOK: Output: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERPARTITION_LOCATION
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl2@created_date=2018-02-01
+POSTHOOK: Output: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+PREHOOK: query: select * from tbl2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl2
+PREHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+POSTHOOK: query: select * from tbl2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+2	2	2018-02-01
+PREHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl2
+POSTHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl2
+# col_name            	data_type           	comment             
+index               	int                 	                    
+value               	int                 	                    
+	 	 
+# Partition Information	 	 
+# col_name            	data_type           	comment             
+created_date        	string              	                    
+	 	 
+# Detailed Partition Information	 	 
+Partition Value:    	[2018-02-01]        	 
+Database:           	default             	 
+Table:              	tbl2                	 
+#### A masked pattern was here ####
+Partition Parameters:	 	 
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"index\":\"true\",\"value\":\"true\"}}
+#### A masked pattern was here ####
+	numFiles            	0                   
+	numRows             	0                   
+	rawDataSize         	0                   
+	totalSize           	0                   
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+Storage Desc Params:	 	 
+	serialization.format	1                   
+PREHOOK: query: insert into tbl2 partition(created_date='2018-02-01') VALUES (1, 1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl2@created_date=2018-02-01
+POSTHOOK: query: insert into tbl2 partition(created_date='2018-02-01') VALUES (1, 1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl2@created_date=2018-02-01
+POSTHOOK: Lineage: tbl2 PARTITION(created_date=2018-02-01).index SCRIPT []
+POSTHOOK: Lineage: tbl2 PARTITION(created_date=2018-02-01).value SCRIPT []
+PREHOOK: query: select * from tbl2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl2
+PREHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+POSTHOOK: query: select * from tbl2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+2	2	2018-02-01
+1	1	2018-02-01
+PREHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl2
+POSTHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl2
+# col_name            	data_type           	comment             
+index               	int                 	                    
+value               	int                 	                    
+	 	 
+# Partition Information	 	 
+# col_name            	data_type           	comment             
+created_date        	string              	                    
+	 	 
+# Detailed Partition Information	 	 
+Partition Value:    	[2018-02-01]        	 
+Database:           	default             	 
+Table:              	tbl2                	 
+#### A masked pattern was here ####
+Partition Parameters:	 	 
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"index\":\"true\",\"value\":\"true\"}}
+#### A masked pattern was here ####
+	numFiles            	2                   
+	numRows             	1                   
+	rawDataSize         	3                   
+	totalSize           	8                   
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+Storage Desc Params:	 	 
+	serialization.format	1