Posted to commits@hive.apache.org by dk...@apache.org on 2023/04/04 12:31:53 UTC

[hive] branch master updated: HIVE-27135: AcidUtils#getHdfsDirSnapshots() throws FNFE when a directory is removed in HDFS (Dayakar M, reviewed by Denys Kuzmenko, Laszlo Vegh, Sourabh Badhya)

This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new c0aa89e7de1 HIVE-27135: AcidUtils#getHdfsDirSnapshots() throws FNFE when a directory is removed in HDFS (Dayakar M, reviewed by Denys Kuzmenko, Laszlo Vegh, Sourabh Badhya)
c0aa89e7de1 is described below

commit c0aa89e7de1b249a62febb238f51264d947dac07
Author: Dayakar M <59...@users.noreply.github.com>
AuthorDate: Tue Apr 4 18:01:41 2023 +0530

    HIVE-27135: AcidUtils#getHdfsDirSnapshots() throws FNFE when a directory is removed in HDFS (Dayakar M, reviewed by Denys Kuzmenko, Laszlo Vegh, Sourabh Badhya)
    
    Closes #4114
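    
    The hunk below replaces the single recursive listing (FileUtils.listFiles)
    with an explicit stack of per-directory iterators. For reference, here is a
    minimal, self-contained sketch of that traversal pattern using only the
    stock Hadoop FileSystem API; the class and method names (DirWalkSketch,
    collectFiles) are illustrative and not part of AcidUtils, and the
    FileNotFoundException catch shows one way a concurrently removed directory
    (e.g. a cleaned-up .hive-staging dir) can be skipped instead of aborting
    the whole scan, rather than being a copy of the committed code:
    
        import java.io.FileNotFoundException;
        import java.io.IOException;
        import java.util.ArrayDeque;
        import java.util.ArrayList;
        import java.util.Deque;
        import java.util.List;
    
        import org.apache.hadoop.fs.FileStatus;
        import org.apache.hadoop.fs.FileSystem;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.fs.RemoteIterator;
    
        public class DirWalkSketch {
          /**
           * Lists every file under 'root' one directory at a time instead of
           * relying on a single recursive iterator. A directory that vanishes
           * between being discovered and being listed is skipped, so the walk
           * survives concurrent deletes of staging/temp directories.
           */
          public static List<FileStatus> collectFiles(FileSystem fs, Path root) throws IOException {
            List<FileStatus> files = new ArrayList<>();
            Deque<Path> pending = new ArrayDeque<>();
            pending.push(root);
            while (!pending.isEmpty()) {
              Path dir = pending.pop();
              RemoteIterator<FileStatus> itr;
              try {
                itr = fs.listStatusIterator(dir);   // one shallow listing per directory
              } catch (FileNotFoundException e) {
                continue;                           // directory removed concurrently; skip it
              }
              while (itr.hasNext()) {
                FileStatus status = itr.next();
                if (status.isDirectory()) {
                  pending.push(status.getPath());   // descend later, iteratively
                } else {
                  files.add(status);
                }
              }
            }
            return files;
          }
        }
    
    The actual patch additionally groups each file under its delta directory
    and records metadata and ACID-format marker files, as shown in the diff.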
---
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java    | 57 ++++++++++++----------
 .../apache/hadoop/hive/ql/io/TestAcidUtils.java    | 39 +++++++++++++++
 .../hive/ql/io/orc/TestInputOutputFormat.java      |  4 +-
 3 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 50f642841f4..5174ef13721 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -67,7 +67,6 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.fs.RemoteIterator;
@@ -1538,32 +1537,36 @@ public class AcidUtils {
   public static Map<Path, HdfsDirSnapshot> getHdfsDirSnapshots(final FileSystem fs, final Path path)
       throws IOException {
     Map<Path, HdfsDirSnapshot> dirToSnapshots = new HashMap<>();
-    RemoteIterator<LocatedFileStatus> itr = FileUtils.listFiles(fs, path, true, acidHiddenFileFilter);
-    while (itr.hasNext()) {
-      FileStatus fStatus = itr.next();
-      Path fPath = fStatus.getPath();
-      if (fStatus.isDirectory() && acidTempDirFilter.accept(fPath)) {
-        addToSnapshot(dirToSnapshots, fPath);
-      } else {
-        Path parentDirPath = fPath.getParent();
-        if (acidTempDirFilter.accept(parentDirPath)) {
-          while (isChildOfDelta(parentDirPath, path)) {
-            // Some cases there are other directory layers between the delta and the datafiles
-            // (export-import mm table, insert with union all to mm table, skewed tables).
-            // But it does not matter for the AcidState, we just need the deltas and the data files
-            // So build the snapshot with the files inside the delta directory
-            parentDirPath = parentDirPath.getParent();
-          }
-          HdfsDirSnapshot dirSnapshot = addToSnapshot(dirToSnapshots, parentDirPath);
-          // We're not filtering out the metadata file and acid format file,
-          // as they represent parts of a valid snapshot
-          // We're not using the cached values downstream, but we can potentially optimize more in a follow-up task
-          if (fStatus.getPath().toString().contains(MetaDataFile.METADATA_FILE)) {
-            dirSnapshot.addMetadataFile(fStatus);
-          } else if (fStatus.getPath().toString().contains(OrcAcidVersion.ACID_FORMAT)) {
-            dirSnapshot.addOrcAcidFormatFile(fStatus);
-          } else {
-            dirSnapshot.addFile(fStatus);
+    Deque<RemoteIterator<FileStatus>> stack = new ArrayDeque<>();
+    stack.push(FileUtils.listStatusIterator(fs, path, acidHiddenFileFilter));
+    while (!stack.isEmpty()) {
+      RemoteIterator<FileStatus> itr = stack.pop();
+      while (itr.hasNext()) {
+        FileStatus fStatus = itr.next();
+        Path fPath = fStatus.getPath();
+        if (fStatus.isDirectory()) {
+          stack.push(FileUtils.listStatusIterator(fs, fPath, acidHiddenFileFilter));
+        } else {
+          Path parentDirPath = fPath.getParent();
+          if (acidTempDirFilter.accept(parentDirPath)) {
+            while (isChildOfDelta(parentDirPath, path)) {
+              // Some cases there are other directory layers between the delta and the datafiles
+              // (export-import mm table, insert with union all to mm table, skewed tables).
+              // But it does not matter for the AcidState, we just need the deltas and the data files
+              // So build the snapshot with the files inside the delta directory
+              parentDirPath = parentDirPath.getParent();
+            }
+            HdfsDirSnapshot dirSnapshot = addToSnapshot(dirToSnapshots, parentDirPath);
+            // We're not filtering out the metadata file and acid format file,
+            // as they represent parts of a valid snapshot
+            // We're not using the cached values downstream, but we can potentially optimize more in a follow-up task
+            if (fStatus.getPath().toString().contains(MetaDataFile.METADATA_FILE)) {
+              dirSnapshot.addMetadataFile(fStatus);
+            } else if (fStatus.getPath().toString().contains(OrcAcidVersion.ACID_FORMAT)) {
+              dirSnapshot.addOrcAcidFormatFile(fStatus);
+            } else {
+              dirSnapshot.addFile(fStatus);
+            }
           }
         }
       }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
index 070ad4b88ab..da2f66b7bc9 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
@@ -20,7 +20,9 @@ package org.apache.hadoop.hive.ql.io;
 import static org.apache.hadoop.hive.common.AcidConstants.SOFT_DELETE_TABLE;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.spy;
 
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.BitSet;
 import java.util.HashMap;
@@ -30,6 +32,7 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.ValidCompactorWriteIdList;
 import org.apache.hadoop.hive.common.ValidReadTxnList;
@@ -48,6 +51,7 @@ import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;
 import org.junit.Assert;
 import org.junit.Test;
+import org.mockito.Mockito;
 
 public class TestAcidUtils {
 
@@ -710,4 +714,39 @@ public class TestAcidUtils {
     parameters.remove(SOFT_DELETE_TABLE);
     Assert.assertFalse(AcidUtils.isTableSoftDeleteEnabled(table, conf));
   }
+
+  @Test
+  public void testShouldGetHDFSSnapShots() throws Exception {
+    MockFileSystem fs = new MockFileSystem(new HiveConf(),
+        new MockFile("mock:/tbl/part1/.hive-staging_dir/-ext-10002", 500, new byte[0]),
+        new MockFile("mock:/tbl/part2/.hive-staging_dir", 500, new byte[0]),
+        new MockFile("mock:/tbl/part1/_tmp_space.db", 500, new byte[0]),
+        new MockFile("mock:/tbl/part1/delta_1_1/bucket-0000-0000", 500, new byte[0]));
+    Path path = new MockPath(fs, "/tbl");
+
+    Map<Path, AcidUtils.HdfsDirSnapshot> hdfsDirSnapshots = AcidUtils.getHdfsDirSnapshots(fs, path);
+    assertEquals(1, hdfsDirSnapshots.size());
+    assertEquals("mock:/tbl/part1/delta_1_1", hdfsDirSnapshots.keySet().stream().findFirst().get().toString());
+  }
+
+  @Test
+  public void testShouldNotThrowFNFEWhenHiveStagingDirectoryIsRemovedWhileFetchingHDFSSnapshots() throws Exception {
+    MockFileSystem fs = new MockFileSystem(new HiveConf(),
+        new MockFile("mock:/tbl/part1/.hive-staging_dir/-ext-10002", 500, new byte[0]),
+        new MockFile("mock:/tbl/part2/.hive-staging_dir", 500, new byte[0]),
+        new MockFile("mock:/tbl/part1/_tmp_space.db", 500, new byte[0]),
+        new MockFile("mock:/tbl/part1/delta_1_1/bucket-0000-0000", 500, new byte[0]));
+    Path path = new MockPath(fs, "/tbl");
+    Path stageDir = new MockPath(fs, "mock:/tbl/part1/.hive-staging_dir");
+    FileSystem mockFs = spy(fs);
+    Mockito.doThrow(new FileNotFoundException("")).when(mockFs).listLocatedStatus(stageDir);
+    try {
+      Map<Path, AcidUtils.HdfsDirSnapshot> hdfsDirSnapshots = AcidUtils.getHdfsDirSnapshots(mockFs, path);
+      assertEquals(1, hdfsDirSnapshots.size());
+      assertEquals("mock:/tbl/part1/delta_1_1", hdfsDirSnapshots.keySet().stream().findFirst().get().toString());
+    }
+    catch (FileNotFoundException fnf) {
+      Assert.fail("Should not throw FileNotFoundException when a directory is removed while fetching HDFSSnapshots");
+    }
+  }
 }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index c98f7f408eb..87e7703b346 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -4285,7 +4285,7 @@ public class TestInputOutputFormat {
     result = splitsForStreamingAcidTable(files);
     files.clear();
     assertEquals(1000, result.get(0).getLength());
-    assertEquals(95, result.get(1).getLength());
+    assertEquals(15, result.get(1).getLength());
 
     // 1 incomplete delta with 2 complete and 1 incomplete blocks: (1000 + 1000 + 500/800)
     files.addAll(mockDeltaWithSideFileForStreaming("delta_0000021_0000030_0000", 2500, 2800));
@@ -4293,7 +4293,7 @@ public class TestInputOutputFormat {
     files.clear();
     assertEquals(1000, result.get(0).getLength());
     assertEquals(1000, result.get(1).getLength());
-    assertEquals(800, result.get(2).getLength());
+    assertEquals(500, result.get(2).getLength());
 
     // 1 complete delta but shorter flush_length - though I think this is almost impossible
     files.addAll(mockDeltaWithSideFileForStreaming("delta_0000021_0000030_0000", 1000, 450));