You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ek...@apache.org on 2018/03/29 22:21:32 UTC
hive git commit: HIVE-19029 - Load Data should prevent loading acid
files (Eugene Koifman, reviewed by Jason Dere)
Repository: hive
Updated Branches:
refs/heads/master 6fd240f6e -> 3e3292b6c
HIVE-19029 - Load Data should prevent loading acid files (Eugene Koifman, reviewed by Jason Dere)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/3e3292b6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/3e3292b6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/3e3292b6
Branch: refs/heads/master
Commit: 3e3292b6c6170671e9a2bf6501c12fff84af15a7
Parents: 6fd240f
Author: Eugene Koifman <ek...@apache.org>
Authored: Thu Mar 29 15:21:26 2018 -0700
Committer: Eugene Koifman <ek...@apache.org>
Committed: Thu Mar 29 15:21:26 2018 -0700
----------------------------------------------------------------------
.../org/apache/hadoop/hive/ql/ErrorMsg.java | 4 ++
.../org/apache/hadoop/hive/ql/io/AcidUtils.java | 3 ++
.../hive/ql/parse/LoadSemanticAnalyzer.java | 40 +++++++++++++++++++-
.../apache/hadoop/hive/ql/TestTxnLoadData.java | 24 +++++++++++-
4 files changed, 68 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index 1faa50a..f3e40eb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -457,6 +457,10 @@ public enum ErrorMsg {
HIVE_GROUPING_SETS_SIZE_LIMIT(10411,
"Grouping sets size cannot be greater than 64"),
REBUILD_NO_MATERIALIZED_VIEW(10412, "Rebuild command only valid for materialized views"),
+ LOAD_DATA_ACID_FILE(10413,
+ "\"{0}\" was created created by Acid write - it cannot be loaded into anther Acid table",
+ true),
+
//========================== 20000 range starts here ========================//
http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 1828f0a..a9ebc90 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -1653,6 +1653,9 @@ public class AcidUtils {
//directory is empty or doesn't have any that could have been produced by load data
return false;
}
+ return isRawFormatFile(dataFile, fs);
+ }
+ public static boolean isRawFormatFile(Path dataFile, FileSystem fs) throws IOException {
try {
Reader reader = OrcFile.createReader(dataFile, OrcFile.readerOptions(fs.getConf()));
/*
http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index d5aace0..e49089b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -150,7 +150,8 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
}
try {
- srcs = matchFilesOrDir(FileSystem.get(fromURI, conf), new Path(fromURI));
+ FileSystem fileSystem = FileSystem.get(fromURI, conf);
+ srcs = matchFilesOrDir(fileSystem, new Path(fromURI));
if (srcs == null || srcs.length == 0) {
throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast,
"No files matching path " + fromURI));
@@ -162,6 +163,7 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
"source contains directory: " + oneSrc.getPath().toString()));
}
}
+ validateAcidFiles(table, srcs, fileSystem);
// Do another loop if table is bucketed
List<String> bucketCols = table.getBucketCols();
if (bucketCols != null && !bucketCols.isEmpty()) {
@@ -198,12 +200,24 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
if (bucketArray[bucketId]) {
throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(
"Multiple files for same bucket : " + bucketId
- + ". Only 1 file per bucket allowed in single load command. To load multiple files for same bucket, use multiple statements for table "
+ + ". Only 1 file per bucket allowed in single load command. To load " +
+ "multiple files for same bucket, use multiple statements for table "
+ table.getFullyQualifiedName()));
}
bucketArray[bucketId] = true;
}
}
+ else {
+ /**
+ * for loading into un-bucketed acid table, files can be named arbitrarily but they will
+ * be renamed during load.
+ * {@link Hive#mvFile(HiveConf, FileSystem, Path, FileSystem, Path, boolean, boolean,
+ * boolean, int)}
+ * and
+ * {@link Hive#copyFiles(HiveConf, FileSystem, FileStatus[], FileSystem, Path, boolean,
+ * boolean, List, boolean)}
+ */
+ }
} catch (IOException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
@@ -213,6 +227,28 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
return Lists.newArrayList(srcs);
}
+ /**
+ * Safety check to make sure a file take from one acid table is not added into another acid table
+ * since the ROW__IDs embedded as part a write to one table won't make sense in different
+ * table/cluster.
+ */
+ private static void validateAcidFiles(Table table, FileStatus[] srcs, FileSystem fs)
+ throws SemanticException {
+ if(!AcidUtils.isFullAcidTable(table)) {
+ return;
+ }
+ try {
+ for (FileStatus oneSrc : srcs) {
+ if (!AcidUtils.MetaDataFile.isRawFormatFile(oneSrc.getPath(), fs)) {
+ throw new SemanticException(ErrorMsg.LOAD_DATA_ACID_FILE, oneSrc.getPath().toString());
+ }
+ }
+ }
+ catch(IOException ex) {
+ throw new SemanticException(ex);
+ }
+ }
+
@Override
public void analyzeInternal(ASTNode ast) throws SemanticException {
boolean isLocal = false;
http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
index 8a01de3..3710311 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
import org.junit.Assert;
import org.junit.Rule;
@@ -453,9 +454,30 @@ public class TestTxnLoadData extends TxnCommandsBaseForTests {
* which will currently make the query non-vectorizable. This means we can't check the file name
* for vectorized version of the test.
*/
- private void checkResult(String[][] expectedResult, String query, boolean isVectorized, String msg) throws Exception{
+ private void checkResult(String[][] expectedResult, String query, boolean isVectorized,
+ String msg) throws Exception{
List<String> rs = runStatementOnDriver(query);
checkExpected(rs, expectedResult, msg + (isVectorized ? " vect" : ""), LOG, !isVectorized);
assertVectorized(isVectorized, query);
}
+ @Test
+ public void testLoadAcidFile() throws Exception {
+ MetastoreConf.setBoolVar(hiveConf, MetastoreConf.ConfVars.CREATE_TABLES_AS_ACID, true);
+ runStatementOnDriver("drop table if exists T");
+ runStatementOnDriver("drop table if exists T2");
+ runStatementOnDriver(
+ "create table T (a int, b int) stored as orc");
+ //This is just a simple way to generate test data
+ runStatementOnDriver("create table T2(a int, b int) stored as orc");
+ runStatementOnDriver("insert into T values(1,2)");
+ List<String> rs = runStatementOnDriver("select INPUT__FILE__NAME from T");
+ Assert.assertEquals(1, rs.size());
+ Assert.assertTrue("Unexpcted file name", rs.get(0)
+ .endsWith("t/delta_0000001_0000001_0000/bucket_00000"));
+ //T2 is an acid table so this should fail
+ CommandProcessorResponse cpr = runStatementOnDriverNegative(
+ "load data local inpath '" + rs.get(0) + "' into table T2");
+ Assert.assertEquals("Unexpected error code",
+ ErrorMsg.LOAD_DATA_ACID_FILE.getErrorCode(), cpr.getErrorCode());
+ }
}