You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ms...@apache.org on 2021/07/28 04:59:25 UTC

[hadoop] branch trunk updated: HDFS-16145. CopyListing fails with FNF exception with snapshot diff. (#3234)

This is an automated email from the ASF dual-hosted git repository.

msingh pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new dac10fc  HDFS-16145. CopyListing fails with FNF exception with snapshot diff. (#3234)
dac10fc is described below

commit dac10fcc202ed6d1fe4bd852f57a6bbcbadd90fe
Author: bshashikant <sh...@apache.org>
AuthorDate: Wed Jul 28 10:29:00 2021 +0530

    HDFS-16145. CopyListing fails with FNF exception with snapshot diff. (#3234)
---
 .../java/org/apache/hadoop/tools/DistCpSync.java   |  64 +++++++-
 .../org/apache/hadoop/tools/TestDistCpSync.java    | 170 +++++++++++++++++++++
 2 files changed, 231 insertions(+), 3 deletions(-)

diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java
index 6cbc936..6cb273e 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.tools;
 
+import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.HadoopIllegalArgumentException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -37,6 +38,7 @@ import java.util.Random;
 import java.util.EnumMap;
 import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.Collections;
 
 /**
  * This class provides the basic functionality to sync two FileSystems based on
@@ -59,6 +61,9 @@ class DistCpSync {
   //
   private EnumMap<SnapshotDiffReport.DiffType, List<DiffInfo>> diffMap;
   private DiffInfo[] renameDiffs;
+  // entries which are marked deleted because of rename to an excluded
+  // target path
+  private List<DiffInfo> deletedByExclusionDiffs;
   private CopyFilter copyFilter;
 
   DistCpSync(DistCpContext context, Configuration conf) {
@@ -68,6 +73,11 @@ class DistCpSync {
     this.copyFilter.initialize();
   }
 
+  @VisibleForTesting
+  public void setCopyFilter(CopyFilter copyFilter) {
+    this.copyFilter = copyFilter;
+  }
+
   private boolean isRdiff() {
     return context.shouldUseRdiff();
   }
@@ -222,7 +232,7 @@ class DistCpSync {
           SnapshotDiffReport.DiffType.values()) {
         diffMap.put(type, new ArrayList<DiffInfo>());
       }
-
+      deletedByExclusionDiffs = null;
       for (SnapshotDiffReport.DiffReportEntry entry : report.getDiffList()) {
         // If the entry is the snapshot root, usually a item like "M\t."
         // in the diff report. We don't need to handle it and cannot handle it,
@@ -250,8 +260,13 @@ class DistCpSync {
               list.add(new DiffInfo(source, target, dt));
             } else {
               list = diffMap.get(SnapshotDiffReport.DiffType.DELETE);
-              list.add(new DiffInfo(source, target,
-                      SnapshotDiffReport.DiffType.DELETE));
+              DiffInfo info = new DiffInfo(source, target,
+                  SnapshotDiffReport.DiffType.DELETE);
+              list.add(info);
+              if (deletedByExclusionDiffs == null) {
+                deletedByExclusionDiffs = new ArrayList<>();
+              }
+              deletedByExclusionDiffs.add(info);
             }
           } else if (copyFilter.shouldCopy(relativeTarget)) {
             list = diffMap.get(SnapshotDiffReport.DiffType.CREATE);
@@ -260,6 +275,9 @@ class DistCpSync {
           }
         }
       }
+      if (deletedByExclusionDiffs != null) {
+        Collections.sort(deletedByExclusionDiffs, DiffInfo.sourceComparator);
+      }
       return true;
     } catch (IOException e) {
       DistCp.LOG.warn("Failed to compute snapshot diff on " + ssDir, e);
@@ -516,6 +534,33 @@ class DistCpSync {
   }
 
   /**
+   * Checks whether a parent dir, or the entry itself, is marked deleted as
+   * part of a dir rename to a path which is excluded by the filter.
+   * @return true if it's marked deleted
+   */
+  private boolean isParentOrSelfMarkedDeleted(DiffInfo diff,
+      List<DiffInfo> deletedDirDiffArray) {
+    for (DiffInfo item : deletedDirDiffArray) {
+      if (item.getSource().equals(diff.getSource())) {
+        // The same path string may appear in:
+        // 1. both deleted and modified snapshot diff entries.
+        // 2. both deleted and created snapshot diff entries.
+        // Case 1 is about the same file/directory, whereas case 2
+        // is about two different files/directories.
+        // We are finding case 1 here, thus we check against DiffType.MODIFY.
+        if (diff.getType() == SnapshotDiffReport.DiffType.MODIFY) {
+          return true;
+        }
+      } else if (isParentOf(item.getSource(), diff.getSource())) {
+        // If deleted entry is the parent of diff entry, then both MODIFY and
+        // CREATE diff entries should be handled.
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
    * For a given sourcePath, get its real path if it or its parent was renamed.
    *
    * For example, if we renamed dirX to dirY, and created dirY/fileX,
@@ -567,6 +612,19 @@ class DistCpSync {
           renameDiffsList.toArray(new DiffInfo[renameDiffsList.size()]);
       Arrays.sort(renameDiffArray, DiffInfo.sourceComparator);
       for (DiffInfo diff : modifyAndCreateDiffs) {
+        // Handles cases where files/dirs were created after a snapshot was
+        // taken and the parent dir was then moved to a location excluded by
+        // the filters. For example, files/dirs created inside a dir in an
+        // HDFS encryption zone: when the parent dir gets deleted, it is
+        // moved to a trash directory inside the encryption zone itself.
+        // If the trash path is excluded by the filters, the dir is marked
+        // for DELETE at the target location. All subsequent creates under
+        // such dirs should be ignored, as well as the modify operation
+        // on the dir itself.
+        if (deletedByExclusionDiffs != null && isParentOrSelfMarkedDeleted(diff,
+            deletedByExclusionDiffs)) {
+          continue;
+        }
         DiffInfo renameItem = getRenameItem(diff, renameDiffArray);
         if (renameItem == null) {
           diff.setTarget(diff.getSource());
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java
index 220caae..61c7274 100644
--- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java
@@ -53,6 +53,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 public class TestDistCpSync {
   private MiniDFSCluster cluster;
@@ -292,6 +293,175 @@ public class TestDistCpSync {
     verifyCopy(dfs.getFileStatus(spath), dfs.getFileStatus(target), false);
   }
 
+  /**
+   * Tests sync when a dir is deleted and re-created between snapshots.
+   */
+  @Test
+  public void testSync1() throws Exception {
+    Path srcpath = new Path(source, "encz-mock");
+    dfs.mkdirs(srcpath);
+    dfs.mkdirs(new Path(source, "encz-mock/datedir"));
+    enableAndCreateFirstSnapshot();
+
+    // before sync, make some further changes on source
+    DFSTestUtil.createFile(dfs, new Path(source, "encz-mock/datedir/file1"),
+        BLOCK_SIZE, DATA_NUM, 0);
+    dfs.delete(new Path(source, "encz-mock/datedir"), true);
+    dfs.mkdirs(new Path(source, "encz-mock/datedir"));
+    DFSTestUtil.createFile(dfs, new Path(source, "encz-mock/datedir/file2"),
+        BLOCK_SIZE, DATA_NUM, 0);
+    dfs.createSnapshot(source, "s2");
+    Assert.assertTrue(dfs.exists(new Path(source, "encz-mock/datedir/file2")));
+
+    SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
+    System.out.println(report);
+
+    DistCpSync distCpSync = new DistCpSync(context, conf);
+
+    // do the sync
+    Assert.assertTrue(distCpSync.sync());
+    // make sure the source path has been updated to the snapshot path
+    final Path spath = new Path(source,
+        HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2");
+    Assert.assertEquals(spath, context.getSourcePaths().get(0));
+
+    // build copy listing
+    final Path listingPath = new Path("/tmp/META/fileList.seq");
+    CopyListing listing =
+        new SimpleCopyListing(conf, new Credentials(), distCpSync);
+    listing.buildListing(listingPath, context);
+
+    Map<Text, CopyListingFileStatus> copyListing = getListing(listingPath);
+    CopyMapper copyMapper = new CopyMapper();
+    StubContext stubContext = new StubContext(conf, null, 0);
+    Mapper<Text, CopyListingFileStatus, Text, Text>.Context mapContext =
+        stubContext.getContext();
+    copyMapper.setup(mapContext);
+    for (Map.Entry<Text, CopyListingFileStatus> entry : copyListing
+        .entrySet()) {
+      copyMapper.map(entry.getKey(), entry.getValue(), mapContext);
+    }
+    Assert.assertTrue(dfs.exists(new Path(target, "encz-mock/datedir/file2")));
+    // verify the source and target now has the same structure
+    verifyCopy(dfs.getFileStatus(spath), dfs.getFileStatus(target), false);
+  }
+
+  /**
+   * Tests sync when a dir is renamed to another location between snapshots.
+   */
+  @Test
+  public void testSyncNew() throws Exception {
+    Path srcpath = new Path(source, "encz-mock");
+    dfs.mkdirs(srcpath);
+    dfs.mkdirs(new Path(source, "encz-mock/datedir"));
+    dfs.mkdirs(new Path(source, "trash"));
+    enableAndCreateFirstSnapshot();
+
+    // before sync, make some further changes on source
+    DFSTestUtil.createFile(dfs, new Path(source, "encz-mock/datedir/file1"),
+        BLOCK_SIZE, DATA_NUM, 0);
+    dfs.rename(new Path(source, "encz-mock/datedir"),
+        new Path(source, "trash"));
+    dfs.mkdirs(new Path(source, "encz-mock/datedir"));
+    DFSTestUtil.createFile(dfs, new Path(source, "encz-mock/datedir/file2"),
+        BLOCK_SIZE, DATA_NUM, 0);
+    dfs.createSnapshot(source, "s2");
+    Assert.assertTrue(dfs.exists(new Path(source, "encz-mock/datedir/file2")));
+
+    SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
+    System.out.println(report);
+
+    DistCpSync distCpSync = new DistCpSync(context, conf);
+
+    // do the sync
+    Assert.assertTrue(distCpSync.sync());
+    // make sure the source path has been updated to the snapshot path
+    final Path spath = new Path(source,
+        HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2");
+    Assert.assertEquals(spath, context.getSourcePaths().get(0));
+
+    // build copy listing
+    final Path listingPath = new Path("/tmp/META/fileList.seq");
+    CopyListing listing =
+        new SimpleCopyListing(conf, new Credentials(), distCpSync);
+    listing.buildListing(listingPath, context);
+
+    Map<Text, CopyListingFileStatus> copyListing = getListing(listingPath);
+    CopyMapper copyMapper = new CopyMapper();
+    StubContext stubContext = new StubContext(conf, null, 0);
+    Mapper<Text, CopyListingFileStatus, Text, Text>.Context mapContext =
+        stubContext.getContext();
+    copyMapper.setup(mapContext);
+    for (Map.Entry<Text, CopyListingFileStatus> entry : copyListing
+        .entrySet()) {
+      copyMapper.map(entry.getKey(), entry.getValue(), mapContext);
+    }
+    Assert.assertTrue(dfs.exists(new Path(target, "encz-mock/datedir/file2")));
+    Assert.assertTrue(dfs.exists(new Path(target, "trash/datedir/file1")));
+    // verify the source and target now has the same structure
+    verifyCopy(dfs.getFileStatus(spath), dfs.getFileStatus(target), false);
+  }
+
+  /**
+   * Tests sync when a dir is renamed to a path excluded by a copy filter.
+   */
+  @Test
+  public void testSyncWithFilters() throws Exception {
+    Path srcpath = new Path(source, "encz-mock");
+    dfs.mkdirs(srcpath);
+    dfs.mkdirs(new Path(source, "encz-mock/datedir"));
+    dfs.mkdirs(new Path(source, "trash"));
+    enableAndCreateFirstSnapshot();
+
+    // before sync, make some further changes on source
+    DFSTestUtil.createFile(dfs, new Path(source, "encz-mock/datedir/file1"),
+        BLOCK_SIZE, DATA_NUM, 0);
+    dfs.rename(new Path(source, "encz-mock/datedir"),
+        new Path(source, "trash"));
+    dfs.mkdirs(new Path(source, "encz-mock/datedir"));
+    DFSTestUtil.createFile(dfs, new Path(source, "encz-mock/datedir/file2"),
+        BLOCK_SIZE, DATA_NUM, 0);
+    dfs.createSnapshot(source, "s2");
+    Assert.assertTrue(dfs.exists(new Path(source, "encz-mock/datedir/file2")));
+
+    SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
+    System.out.println(report);
+    List<Pattern> filters = new ArrayList<>();
+    filters.add(Pattern.compile(".*trash.*"));
+    RegexCopyFilter regexCopyFilter = new RegexCopyFilter("fakeFile");
+    regexCopyFilter.setFilters(filters);
+
+    DistCpSync distCpSync = new DistCpSync(context, conf);
+    distCpSync.setCopyFilter(regexCopyFilter);
+
+    // do the sync
+    Assert.assertTrue(distCpSync.sync());
+    // make sure the source path has been updated to the snapshot path
+    final Path spath = new Path(source,
+        HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2");
+    Assert.assertEquals(spath, context.getSourcePaths().get(0));
+
+    // build copy listing
+    final Path listingPath = new Path("/tmp/META/fileList.seq");
+    CopyListing listing =
+        new SimpleCopyListing(conf, new Credentials(), distCpSync);
+    listing.buildListing(listingPath, context);
+
+    Map<Text, CopyListingFileStatus> copyListing = getListing(listingPath);
+    CopyMapper copyMapper = new CopyMapper();
+    StubContext stubContext = new StubContext(conf, null, 0);
+    Mapper<Text, CopyListingFileStatus, Text, Text>.Context mapContext =
+        stubContext.getContext();
+    copyMapper.setup(mapContext);
+    for (Map.Entry<Text, CopyListingFileStatus> entry : copyListing
+        .entrySet()) {
+      copyMapper.map(entry.getKey(), entry.getValue(), mapContext);
+    }
+    Assert.assertTrue(dfs.exists(new Path(target, "encz-mock/datedir/file2")));
+    Assert.assertFalse(dfs.exists(new Path(target, "encz-mock/datedir/file1")));
+    Assert.assertFalse(dfs.exists(new Path(target, "trash/datedir/file1")));
+  }
+
   private Map<Text, CopyListingFileStatus> getListing(Path listingPath)
       throws Exception {
     SequenceFile.Reader reader = new SequenceFile.Reader(conf,

---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org