You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by se...@apache.org on 2018/07/23 22:56:39 UTC
hive git commit: HIVE-19891 : inserting into external tables with
custom partition directories may cause data loss (Sergey Shelukhin,
reviewed by Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 5e7aa0986 -> 4e9562f1e
HIVE-19891 : inserting into external tables with custom partition directories may cause data loss (Sergey Shelukhin, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4e9562f1
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4e9562f1
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4e9562f1
Branch: refs/heads/master
Commit: 4e9562f1e7a0baeae6b5e0ead8f54a43f3196f5b
Parents: 5e7aa09
Author: sergey <se...@apache.org>
Authored: Mon Jul 23 15:55:55 2018 -0700
Committer: sergey <se...@apache.org>
Committed: Mon Jul 23 15:56:25 2018 -0700
----------------------------------------------------------------------
.../apache/hadoop/hive/ql/exec/MoveTask.java | 2 +-
.../apache/hadoop/hive/ql/metadata/Hive.java | 22 ++-
.../hive/ql/parse/DDLSemanticAnalyzer.java | 1 +
.../hadoop/hive/ql/plan/LoadTableDesc.java | 10 +-
.../queries/clientpositive/external_insert.q | 14 ++
.../clientpositive/external_insert.q.out | 158 +++++++++++++++++++
6 files changed, 198 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
index bf7749d..322207d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
@@ -482,7 +482,7 @@ public class MoveTask extends Task<MoveWork> implements Serializable {
db.loadPartition(tbd.getSourcePath(), db.getTable(tbd.getTable().getTableName()),
tbd.getPartitionSpec(), tbd.getLoadFileType(), tbd.getInheritTableSpecs(),
- isSkewedStoredAsDirs(tbd), work.isSrcLocal(),
+ tbd.getInheritLocation(), isSkewedStoredAsDirs(tbd), work.isSrcLocal(),
work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID &&
!tbd.isMmTable(),
hasFollowingStatsTask(),
http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 1fe1fb6..fdb4fa2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -1717,7 +1717,8 @@ public class Hive {
* @return Partition object being loaded with data
*/
public Partition loadPartition(Path loadPath, Table tbl, Map<String, String> partSpec,
- LoadFileType loadFileType, boolean inheritTableSpecs, boolean isSkewedStoreAsSubdir,
+ LoadFileType loadFileType, boolean inheritTableSpecs, boolean inheritLocation,
+ boolean isSkewedStoreAsSubdir,
boolean isSrcLocal, boolean isAcidIUDoperation, boolean hasFollowingStatsTask, Long writeId,
int stmtId, boolean isInsertOverwrite) throws HiveException {
Path tblDataLocationPath = tbl.getDataLocation();
@@ -1741,10 +1742,8 @@ public class Hive {
Path oldPartPath = (oldPart != null) ? oldPart.getDataLocation() : null;
Path newPartPath = null;
- if (inheritTableSpecs) {
- Path partPath = new Path(tbl.getDataLocation(), Warehouse.makePartPath(partSpec));
- newPartPath = new Path(tblDataLocationPath.toUri().getScheme(),
- tblDataLocationPath.toUri().getAuthority(), partPath.toUri().getPath());
+ if (inheritLocation) {
+ newPartPath = genPartPathFromTable(tbl, partSpec, tblDataLocationPath);
if(oldPart != null) {
/*
@@ -1761,7 +1760,8 @@ public class Hive {
}
}
} else {
- newPartPath = oldPartPath;
+ newPartPath = oldPartPath == null
+ ? newPartPath = genPartPathFromTable(tbl, partSpec, tblDataLocationPath) : oldPartPath;
}
List<Path> newFiles = Collections.synchronizedList(new ArrayList<Path>());
@@ -1940,6 +1940,14 @@ public class Hive {
}
}
+
+ private static Path genPartPathFromTable(Table tbl, Map<String, String> partSpec,
+ Path tblDataLocationPath) throws MetaException {
+ Path partPath = new Path(tbl.getDataLocation(), Warehouse.makePartPath(partSpec));
+ return new Path(tblDataLocationPath.toUri().getScheme(),
+ tblDataLocationPath.toUri().getAuthority(), partPath.toUri().getPath());
+ }
+
/**
* Load Data commands for fullAcid tables write to base_x (if there is overwrite clause) or
* delta_x_x directory - same as any other Acid write. This method modifies the destPath to add
@@ -2262,7 +2270,7 @@ private void constructOneLBLocationMap(FileStatus fSta,
// load the partition
Partition newPartition = loadPartition(partPath, tbl, fullPartSpec, loadFileType,
- true, numLB > 0, false, isAcid, hasFollowingStatsTask, writeId, stmtId,
+ true, false, numLB > 0, false, isAcid, hasFollowingStatsTask, writeId, stmtId,
isInsertOverwrite);
partitionsMap.put(fullPartSpec, newPartition);
http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
index 9373df6..2007e13 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
@@ -2094,6 +2094,7 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer {
LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc,
partSpec == null ? new HashMap<>() : partSpec);
ltd.setLbCtx(lbCtx);
+ ltd.setInheritTableSpecs(true);
Task<MoveWork> moveTsk =
TaskFactory.get(new MoveWork(null, null, ltd, null, false));
mergeTask.addDependentTask(moveTsk);
http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
index f15b3c3..af2ece4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
@@ -38,6 +38,7 @@ public class LoadTableDesc extends LoadDesc implements Serializable {
private ListBucketingCtx lbCtx;
private boolean inheritTableSpecs = true; //For partitions, flag controlling whether the current
//table specs are to be used
+ private boolean inheritLocation = false; // A silly setting.
private int stmtId;
private Long currentWriteId;
private boolean isInsertOverwrite;
@@ -71,6 +72,7 @@ public class LoadTableDesc extends LoadDesc implements Serializable {
this.dpCtx = o.dpCtx;
this.lbCtx = o.lbCtx;
this.inheritTableSpecs = o.inheritTableSpecs;
+ this.inheritLocation = o.inheritLocation;
this.currentWriteId = o.currentWriteId;
this.table = o.table;
this.partitionSpec = o.partitionSpec;
@@ -207,8 +209,14 @@ public class LoadTableDesc extends LoadDesc implements Serializable {
return inheritTableSpecs;
}
+ public boolean getInheritLocation() {
+ return inheritLocation;
+ }
+
public void setInheritTableSpecs(boolean inheritTableSpecs) {
- this.inheritTableSpecs = inheritTableSpecs;
+ // Set inheritLocation if this is set to true explicitly.
+ // TODO: Who actually needs this? Might just be some pointless legacy code.
+ this.inheritTableSpecs = inheritLocation = inheritTableSpecs;
}
public boolean isInsertOverwrite() {
http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/test/queries/clientpositive/external_insert.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/external_insert.q b/ql/src/test/queries/clientpositive/external_insert.q
new file mode 100644
index 0000000..9a62609
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/external_insert.q
@@ -0,0 +1,14 @@
+drop table tbl1;
+
+-- tbl1 is only used to create a directory with data
+CREATE TABLE tbl1 (index int, value int) LOCATION 'file:${system:test.tmp.dir}/external_insert';
+insert into tbl1 VALUES (2, 2);
+
+CREATE external TABLE tbl2 (index int, value int ) PARTITIONED BY ( created_date string );
+ALTER TABLE tbl2 ADD PARTITION(created_date='2018-02-01');
+ALTER TABLE tbl2 PARTITION(created_date='2018-02-01') SET LOCATION 'file:${system:test.tmp.dir}/external_insert';
+select * from tbl2;
+describe formatted tbl2 partition(created_date='2018-02-01');
+insert into tbl2 partition(created_date='2018-02-01') VALUES (1, 1);
+select * from tbl2;
+describe formatted tbl2 partition(created_date='2018-02-01');
http://git-wip-us.apache.org/repos/asf/hive/blob/4e9562f1/ql/src/test/results/clientpositive/external_insert.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/external_insert.q.out b/ql/src/test/results/clientpositive/external_insert.q.out
new file mode 100644
index 0000000..fbec406
--- /dev/null
+++ b/ql/src/test/results/clientpositive/external_insert.q.out
@@ -0,0 +1,158 @@
+PREHOOK: query: drop table tbl1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table tbl1
+POSTHOOK: type: DROPTABLE
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl1
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl1
+PREHOOK: query: insert into tbl1 VALUES (2, 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl1
+POSTHOOK: query: insert into tbl1 VALUES (2, 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl1
+POSTHOOK: Lineage: tbl1.index SCRIPT []
+POSTHOOK: Lineage: tbl1.value SCRIPT []
+PREHOOK: query: CREATE external TABLE tbl2 (index int, value int ) PARTITIONED BY ( created_date string )
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl2
+POSTHOOK: query: CREATE external TABLE tbl2 (index int, value int ) PARTITIONED BY ( created_date string )
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl2
+PREHOOK: query: ALTER TABLE tbl2 ADD PARTITION(created_date='2018-02-01')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@tbl2
+POSTHOOK: query: ALTER TABLE tbl2 ADD PARTITION(created_date='2018-02-01')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@tbl2
+POSTHOOK: Output: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+PREHOOK: type: ALTERPARTITION_LOCATION
+PREHOOK: Input: default@tbl2
+PREHOOK: Output: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERPARTITION_LOCATION
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl2@created_date=2018-02-01
+POSTHOOK: Output: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+PREHOOK: query: select * from tbl2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl2
+PREHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+POSTHOOK: query: select * from tbl2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+2 2 2018-02-01
+PREHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl2
+POSTHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl2
+# col_name data_type comment
+index int
+value int
+
+# Partition Information
+# col_name data_type comment
+created_date string
+
+# Detailed Partition Information
+Partition Value: [2018-02-01]
+Database: default
+Table: tbl2
+#### A masked pattern was here ####
+Partition Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"index\":\"true\",\"value\":\"true\"}}
+#### A masked pattern was here ####
+ numFiles 0
+ numRows 0
+ rawDataSize 0
+ totalSize 0
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat: org.apache.hadoop.mapred.TextInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: insert into tbl2 partition(created_date='2018-02-01') VALUES (1, 1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl2@created_date=2018-02-01
+POSTHOOK: query: insert into tbl2 partition(created_date='2018-02-01') VALUES (1, 1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl2@created_date=2018-02-01
+POSTHOOK: Lineage: tbl2 PARTITION(created_date=2018-02-01).index SCRIPT []
+POSTHOOK: Lineage: tbl2 PARTITION(created_date=2018-02-01).value SCRIPT []
+PREHOOK: query: select * from tbl2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl2
+PREHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+POSTHOOK: query: select * from tbl2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl2
+POSTHOOK: Input: default@tbl2@created_date=2018-02-01
+#### A masked pattern was here ####
+2 2 2018-02-01
+1 1 2018-02-01
+PREHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl2
+POSTHOOK: query: describe formatted tbl2 partition(created_date='2018-02-01')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl2
+# col_name data_type comment
+index int
+value int
+
+# Partition Information
+# col_name data_type comment
+created_date string
+
+# Detailed Partition Information
+Partition Value: [2018-02-01]
+Database: default
+Table: tbl2
+#### A masked pattern was here ####
+Partition Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"index\":\"true\",\"value\":\"true\"}}
+#### A masked pattern was here ####
+ numFiles 2
+ numRows 1
+ rawDataSize 3
+ totalSize 8
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat: org.apache.hadoop.mapred.TextInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1