Posted to commits@hive.apache.org by dk...@apache.org on 2023/04/04 12:31:53 UTC
[hive] branch master updated: HIVE-27135: AcidUtils#getHdfsDirSnapshots() throws FNFE when a directory is removed in HDFS (Dayakar M, reviewed by Denys Kuzmenko, Laszlo Vegh, Sourabh Badhya)
This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new c0aa89e7de1 HIVE-27135: AcidUtils#getHdfsDirSnapshots() throws FNFE when a directory is removed in HDFS (Dayakar M, reviewed by Denys Kuzmenko, Laszlo Vegh, Sourabh Badhya)
c0aa89e7de1 is described below
commit c0aa89e7de1b249a62febb238f51264d947dac07
Author: Dayakar M <59...@users.noreply.github.com>
AuthorDate: Tue Apr 4 18:01:41 2023 +0530
HIVE-27135: AcidUtils#getHdfsDirSnapshots() throws FNFE when a directory is removed in HDFS (Dayakar M, reviewed by Denys Kuzmenko, Laszlo Vegh, Sourabh Badhya)
Closes #4114
---
.../org/apache/hadoop/hive/ql/io/AcidUtils.java | 57 ++++++++++++----------
.../apache/hadoop/hive/ql/io/TestAcidUtils.java | 39 +++++++++++++++
.../hive/ql/io/orc/TestInputOutputFormat.java | 4 +-
3 files changed, 71 insertions(+), 29 deletions(-)
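
Background on the failure: the previous implementation built the snapshot from a single
recursive listing of the table directory. A sketch of the race that listing is exposed to,
against the plain Hadoop FileSystem API (fs and tableDir are illustrative names):

    // Roughly the old code path, FileUtils.listFiles(fs, path, true, acidHiddenFileFilter):
    RemoteIterator<LocatedFileStatus> itr = fs.listFiles(tableDir, true /* recursive */);
    while (itr.hasNext()) {
      // The recursive iterator enumerates every subdirectory, hidden staging
      // directories included, since the filter screens results rather than
      // pruning the recursion. If another client deletes a .hive-staging_*
      // directory after it has been queued for listing but before it is
      // visited, hasNext()/next() throws FileNotFoundException and the whole
      // snapshot computation fails.
      FileStatus fStatus = itr.next();
    }
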
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 50f642841f4..5174ef13721 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -67,7 +67,6 @@ import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
@@ -1538,32 +1537,36 @@ public class AcidUtils {
public static Map<Path, HdfsDirSnapshot> getHdfsDirSnapshots(final FileSystem fs, final Path path)
throws IOException {
Map<Path, HdfsDirSnapshot> dirToSnapshots = new HashMap<>();
- RemoteIterator<LocatedFileStatus> itr = FileUtils.listFiles(fs, path, true, acidHiddenFileFilter);
- while (itr.hasNext()) {
- FileStatus fStatus = itr.next();
- Path fPath = fStatus.getPath();
- if (fStatus.isDirectory() && acidTempDirFilter.accept(fPath)) {
- addToSnapshot(dirToSnapshots, fPath);
- } else {
- Path parentDirPath = fPath.getParent();
- if (acidTempDirFilter.accept(parentDirPath)) {
- while (isChildOfDelta(parentDirPath, path)) {
- // In some cases there are other directory layers between the delta and the data files
- // (export-import MM table, insert with union all to MM table, skewed tables),
- // but that does not matter for the AcidState; we just need the deltas and the data files,
- // so build the snapshot from the files inside the delta directory.
- parentDirPath = parentDirPath.getParent();
- }
- HdfsDirSnapshot dirSnapshot = addToSnapshot(dirToSnapshots, parentDirPath);
- // We're not filtering out the metadata file and acid format file,
- // as they represent parts of a valid snapshot
- // We're not using the cached values downstream, but we can potentially optimize more in a follow-up task
- if (fStatus.getPath().toString().contains(MetaDataFile.METADATA_FILE)) {
- dirSnapshot.addMetadataFile(fStatus);
- } else if (fStatus.getPath().toString().contains(OrcAcidVersion.ACID_FORMAT)) {
- dirSnapshot.addOrcAcidFormatFile(fStatus);
- } else {
- dirSnapshot.addFile(fStatus);
+ Deque<RemoteIterator<FileStatus>> stack = new ArrayDeque<>();
+ stack.push(FileUtils.listStatusIterator(fs, path, acidHiddenFileFilter));
+ while (!stack.isEmpty()) {
+ RemoteIterator<FileStatus> itr = stack.pop();
+ while (itr.hasNext()) {
+ FileStatus fStatus = itr.next();
+ Path fPath = fStatus.getPath();
+ if (fStatus.isDirectory()) {
+ stack.push(FileUtils.listStatusIterator(fs, fPath, acidHiddenFileFilter));
+ } else {
+ Path parentDirPath = fPath.getParent();
+ if (acidTempDirFilter.accept(parentDirPath)) {
+ while (isChildOfDelta(parentDirPath, path)) {
+ // In some cases there are other directory layers between the delta and the data files
+ // (export-import MM table, insert with union all to MM table, skewed tables),
+ // but that does not matter for the AcidState; we just need the deltas and the data files,
+ // so build the snapshot from the files inside the delta directory.
+ parentDirPath = parentDirPath.getParent();
+ }
+ HdfsDirSnapshot dirSnapshot = addToSnapshot(dirToSnapshots, parentDirPath);
+ // We're not filtering out the metadata file and acid format file,
+ // as they represent parts of a valid snapshot
+ // We're not using the cached values downstream, but we can potentially optimize more in a follow-up task
+ if (fStatus.getPath().toString().contains(MetaDataFile.METADATA_FILE)) {
+ dirSnapshot.addMetadataFile(fStatus);
+ } else if (fStatus.getPath().toString().contains(OrcAcidVersion.ACID_FORMAT)) {
+ dirSnapshot.addOrcAcidFormatFile(fStatus);
+ } else {
+ dirSnapshot.addFile(fStatus);
+ }
}
}
}
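
The replacement above walks the tree one directory at a time with an explicit stack, so the
filter can prune a directory before it is ever listed. A minimal, self-contained sketch of the
same pattern using the plain Hadoop FileSystem API (FileUtils.listStatusIterator is Hive's
filtered wrapper; here the filtering is inlined, and walk/visit are illustrative names):

    import java.io.IOException;
    import java.util.ArrayDeque;
    import java.util.Deque;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.PathFilter;
    import org.apache.hadoop.fs.RemoteIterator;

    public final class FilteredWalk {
      /** Depth-first listing that applies the filter before descending into a directory. */
      static void walk(FileSystem fs, Path root, PathFilter filter) throws IOException {
        Deque<RemoteIterator<FileStatus>> stack = new ArrayDeque<>();
        stack.push(fs.listStatusIterator(root));
        while (!stack.isEmpty()) {
          RemoteIterator<FileStatus> itr = stack.pop();
          while (itr.hasNext()) {
            FileStatus status = itr.next();
            Path p = status.getPath();
            if (!filter.accept(p)) {
              continue; // hidden/staging entries are dropped before they are ever listed
            }
            if (status.isDirectory()) {
              stack.push(fs.listStatusIterator(p)); // descend only into accepted dirs
            } else {
              visit(status); // e.g. record the file under its parent directory's snapshot
            }
          }
        }
      }

      static void visit(FileStatus file) {
        // placeholder for per-file handling
      }
    }

Because a filtered-out directory such as .hive-staging_* is never handed to listStatusIterator,
its concurrent removal can no longer surface as a FileNotFoundException from the walk.
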
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
index 070ad4b88ab..da2f66b7bc9 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java
@@ -20,7 +20,9 @@ package org.apache.hadoop.hive.ql.io;
import static org.apache.hadoop.hive.common.AcidConstants.SOFT_DELETE_TABLE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.spy;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.BitSet;
import java.util.HashMap;
@@ -30,6 +32,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidCompactorWriteIdList;
import org.apache.hadoop.hive.common.ValidReadTxnList;
@@ -48,6 +51,7 @@ import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;
import org.junit.Assert;
import org.junit.Test;
+import org.mockito.Mockito;
public class TestAcidUtils {
@@ -710,4 +714,39 @@ public class TestAcidUtils {
parameters.remove(SOFT_DELETE_TABLE);
Assert.assertFalse(AcidUtils.isTableSoftDeleteEnabled(table, conf));
}
+
+ @Test
+ public void testShouldGetHDFSSnapShots() throws Exception {
+ MockFileSystem fs = new MockFileSystem(new HiveConf(),
+ new MockFile("mock:/tbl/part1/.hive-staging_dir/-ext-10002", 500, new byte[0]),
+ new MockFile("mock:/tbl/part2/.hive-staging_dir", 500, new byte[0]),
+ new MockFile("mock:/tbl/part1/_tmp_space.db", 500, new byte[0]),
+ new MockFile("mock:/tbl/part1/delta_1_1/bucket-0000-0000", 500, new byte[0]));
+ Path path = new MockPath(fs, "/tbl");
+
+ Map<Path, AcidUtils.HdfsDirSnapshot> hdfsDirSnapshots = AcidUtils.getHdfsDirSnapshots(fs, path);
+ assertEquals(1, hdfsDirSnapshots.size());
+ assertEquals("mock:/tbl/part1/delta_1_1", hdfsDirSnapshots.keySet().stream().findFirst().get().toString());
+ }
+
+ @Test
+ public void testShouldNotThrowFNFEWhenHiveStagingDirectoryIsRemovedWhileFetchingHDFSSnapshots() throws Exception {
+ MockFileSystem fs = new MockFileSystem(new HiveConf(),
+ new MockFile("mock:/tbl/part1/.hive-staging_dir/-ext-10002", 500, new byte[0]),
+ new MockFile("mock:/tbl/part2/.hive-staging_dir", 500, new byte[0]),
+ new MockFile("mock:/tbl/part1/_tmp_space.db", 500, new byte[0]),
+ new MockFile("mock:/tbl/part1/delta_1_1/bucket-0000-0000", 500, new byte[0]));
+ Path path = new MockPath(fs, "/tbl");
+ Path stageDir = new MockPath(fs, "mock:/tbl/part1/.hive-staging_dir");
+ FileSystem mockFs = spy(fs);
+ Mockito.doThrow(new FileNotFoundException("")).when(mockFs).listLocatedStatus(stageDir);
+ try {
+ Map<Path, AcidUtils.HdfsDirSnapshot> hdfsDirSnapshots = AcidUtils.getHdfsDirSnapshots(mockFs, path);
+ assertEquals(1, hdfsDirSnapshots.size());
+ assertEquals("mock:/tbl/part1/delta_1_1", hdfsDirSnapshots.keySet().stream().findFirst().get().toString());
+ }
+ catch (FileNotFoundException fnf) {
+ Assert.fail("Should not throw FileNotFoundException when a directory is removed while fetching HDFSSnapshots");
+ }
+ }
}
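
The second test pins the behavior down with a Mockito spy: listing the staging directory is
forced to fail, so the test only passes if the traversal never descends into it. The same
pattern in isolation (a sketch; the path and exception message are illustrative):

    FileSystem spyFs = Mockito.spy(fs);
    Mockito.doThrow(new FileNotFoundException("staging dir removed concurrently"))
        .when(spyFs).listLocatedStatus(new Path("mock:/tbl/part1/.hive-staging_dir"));
    // Any code path that still lists the staging directory now fails fast, so a
    // green run proves the filter was applied before the directory was listed.

doThrow(...).when(...) is the right stubbing order for a spy; when(spyFs.listLocatedStatus(...))
would invoke the real method while the stub is being set up.
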
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index c98f7f408eb..87e7703b346 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -4285,7 +4285,7 @@ public class TestInputOutputFormat {
result = splitsForStreamingAcidTable(files);
files.clear();
assertEquals(1000, result.get(0).getLength());
- assertEquals(95, result.get(1).getLength());
+ assertEquals(15, result.get(1).getLength());
// 1 incomplete delta with 2 complete and 1 incomplete blocks: (1000 + 1000 + 500/800)
files.addAll(mockDeltaWithSideFileForStreaming("delta_0000021_0000030_0000", 2500, 2800));
@@ -4293,7 +4293,7 @@ public class TestInputOutputFormat {
files.clear();
assertEquals(1000, result.get(0).getLength());
assertEquals(1000, result.get(1).getLength());
- assertEquals(800, result.get(2).getLength());
+ assertEquals(500, result.get(2).getLength());
// 1 complete delta but shorter flush_length - though I think this is almost impossible
files.addAll(mockDeltaWithSideFileForStreaming("delta_0000021_0000030_0000", 1000, 450));
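
For context on the adjusted expectations: a streaming delta carries a _flush_length side file
recording the last durable offset of its bucket file, and the new values (15 instead of 95,
500 instead of 800) indicate the split lengths are expected to follow that recorded length
rather than the raw file size. A hedged sketch of how such a side file is read (ORC ships a
real helper, OrcAcidUtils.getLastFlushLength; this inline version is only illustrative):

    // The side file is a sequence of 8-byte lengths, one appended per flush;
    // the last complete entry is the readable length of the bucket file.
    static long lastFlushLength(FileSystem fs, Path sideFile) throws IOException {
      long entries = fs.getFileStatus(sideFile).getLen() / Long.BYTES;
      long last = -1;
      try (FSDataInputStream in = fs.open(sideFile)) {
        for (long i = 0; i < entries; i++) {
          last = in.readLong();
        }
      }
      return last;
    }
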