You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ek...@apache.org on 2018/03/29 22:21:32 UTC

hive git commit: HIVE-19029 - Load Data should prevent loading acid files (Eugene Koifman, reviewed by Jason Dere)

Repository: hive
Updated Branches:
  refs/heads/master 6fd240f6e -> 3e3292b6c


HIVE-19029 - Load Data should prevent loading acid files (Eugene Koifman, reviewed by Jason Dere)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/3e3292b6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/3e3292b6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/3e3292b6

Branch: refs/heads/master
Commit: 3e3292b6c6170671e9a2bf6501c12fff84af15a7
Parents: 6fd240f
Author: Eugene Koifman <ek...@apache.org>
Authored: Thu Mar 29 15:21:26 2018 -0700
Committer: Eugene Koifman <ek...@apache.org>
Committed: Thu Mar 29 15:21:26 2018 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/ql/ErrorMsg.java     |  4 ++
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java |  3 ++
 .../hive/ql/parse/LoadSemanticAnalyzer.java     | 40 +++++++++++++++++++-
 .../apache/hadoop/hive/ql/TestTxnLoadData.java  | 24 +++++++++++-
 4 files changed, 68 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index 1faa50a..f3e40eb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -457,6 +457,10 @@ public enum ErrorMsg {
   HIVE_GROUPING_SETS_SIZE_LIMIT(10411,
     "Grouping sets size cannot be greater than 64"),
   REBUILD_NO_MATERIALIZED_VIEW(10412, "Rebuild command only valid for materialized views"),
+  LOAD_DATA_ACID_FILE(10413,
+      "\"{0}\" was created created by Acid write - it cannot be loaded into anther Acid table",
+      true),
+
 
   //========================== 20000 range starts here ========================//
 

http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 1828f0a..a9ebc90 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -1653,6 +1653,9 @@ public class AcidUtils {
         //directory is empty or doesn't have any that could have been produced by load data
         return false;
       }
+      return isRawFormatFile(dataFile, fs);
+    }
+    public static boolean isRawFormatFile(Path dataFile, FileSystem fs) throws IOException {
       try {
         Reader reader = OrcFile.createReader(dataFile, OrcFile.readerOptions(fs.getConf()));
         /*

http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index d5aace0..e49089b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -150,7 +150,8 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
     }
 
     try {
-      srcs = matchFilesOrDir(FileSystem.get(fromURI, conf), new Path(fromURI));
+      FileSystem fileSystem = FileSystem.get(fromURI, conf);
+      srcs = matchFilesOrDir(fileSystem, new Path(fromURI));
       if (srcs == null || srcs.length == 0) {
         throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast,
             "No files matching path " + fromURI));
@@ -162,6 +163,7 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
               "source contains directory: " + oneSrc.getPath().toString()));
         }
       }
+      validateAcidFiles(table, srcs, fileSystem);
       // Do another loop if table is bucketed
       List<String> bucketCols = table.getBucketCols();
       if (bucketCols != null && !bucketCols.isEmpty()) {
@@ -198,12 +200,24 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
           if (bucketArray[bucketId]) {
             throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(
                     "Multiple files for same bucket : " + bucketId
-                            + ". Only 1 file per bucket allowed in single load command. To load multiple files for same bucket, use multiple statements for table "
+                            + ". Only 1 file per bucket allowed in single load command. To load " +
+                        "multiple files for same bucket, use multiple statements for table "
             + table.getFullyQualifiedName()));
           }
           bucketArray[bucketId] = true;
         }
       }
+      else {
+        /**
+         * for loading into un-bucketed acid table, files can be named arbitrarily but they will
+         * be renamed during load.
+         * {@link Hive#mvFile(HiveConf, FileSystem, Path, FileSystem, Path, boolean, boolean,
+         * boolean, int)}
+         * and
+         * {@link Hive#copyFiles(HiveConf, FileSystem, FileStatus[], FileSystem, Path, boolean,
+         * boolean, List, boolean)}
+         */
+      }
     } catch (IOException e) {
       // Has to use full name to make sure it does not conflict with
       // org.apache.commons.lang.StringUtils
@@ -213,6 +227,28 @@ public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {
     return Lists.newArrayList(srcs);
   }
 
+  /**
+   * Safety check to make sure a file take from one acid table is not added into another acid table
+   * since the ROW__IDs embedded as part a write to one table won't make sense in different
+   * table/cluster.
+   */
+  private static void validateAcidFiles(Table table, FileStatus[] srcs, FileSystem fs)
+      throws SemanticException {
+    if(!AcidUtils.isFullAcidTable(table)) {
+      return;
+    }
+    try {
+      for (FileStatus oneSrc : srcs) {
+        if (!AcidUtils.MetaDataFile.isRawFormatFile(oneSrc.getPath(), fs)) {
+          throw new SemanticException(ErrorMsg.LOAD_DATA_ACID_FILE, oneSrc.getPath().toString());
+        }
+      }
+    }
+    catch(IOException ex) {
+      throw new SemanticException(ex);
+    }
+  }
+
   @Override
   public void analyzeInternal(ASTNode ast) throws SemanticException {
     boolean isLocal = false;

http://git-wip-us.apache.org/repos/asf/hive/blob/3e3292b6/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
index 8a01de3..3710311 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnLoadData.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
 import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
 import org.junit.Assert;
 import org.junit.Rule;
@@ -453,9 +454,30 @@ public class TestTxnLoadData extends TxnCommandsBaseForTests {
    * which will currently make the query non-vectorizable.  This means we can't check the file name
    * for vectorized version of the test.
    */
-  private void checkResult(String[][] expectedResult, String query, boolean isVectorized, String msg) throws Exception{
+  private void checkResult(String[][] expectedResult, String query, boolean isVectorized,
+      String msg) throws Exception{
     List<String> rs = runStatementOnDriver(query);
     checkExpected(rs, expectedResult, msg + (isVectorized ? " vect" : ""), LOG, !isVectorized);
     assertVectorized(isVectorized, query);
   }
+  @Test
+  public void testLoadAcidFile() throws Exception {
+    MetastoreConf.setBoolVar(hiveConf, MetastoreConf.ConfVars.CREATE_TABLES_AS_ACID, true);
+    runStatementOnDriver("drop table if exists T");
+    runStatementOnDriver("drop table if exists T2");
+    runStatementOnDriver(
+        "create table T (a int, b int) stored as orc");
+    //This is just a simple way to generate test data
+    runStatementOnDriver("create table T2(a int, b int) stored as orc");
+    runStatementOnDriver("insert into T values(1,2)");
+    List<String> rs = runStatementOnDriver("select INPUT__FILE__NAME from T");
+    Assert.assertEquals(1, rs.size());
+    Assert.assertTrue("Unexpcted file name", rs.get(0)
+        .endsWith("t/delta_0000001_0000001_0000/bucket_00000"));
+    //T2 is an acid table so this should fail
+    CommandProcessorResponse cpr = runStatementOnDriverNegative(
+        "load data local inpath '" + rs.get(0) + "' into table T2");
+    Assert.assertEquals("Unexpected error code",
+        ErrorMsg.LOAD_DATA_ACID_FILE.getErrorCode(), cpr.getErrorCode());
+  }
 }