You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by st...@apache.org on 2020/04/09 17:24:14 UTC

[hadoop] branch branch-3.3 updated: HADOOP-16932. distcp copy calls getFileStatus() needlessly and can fail against S3 (#1936)

This is an automated email from the ASF dual-hosted git repository.

stevel pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new e4331a7  HADOOP-16932. distcp copy calls getFileStatus() needlessly and can fail against S3 (#1936)
e4331a7 is described below

commit e4331a73c955d08e046c13395b2f92613f25b1c8
Author: Steve Loughran <st...@cloudera.com>
AuthorDate: Tue Apr 7 17:55:55 2020 +0100

    HADOOP-16932. distcp copy calls getFileStatus() needlessly and can fail against S3 (#1936)
    
    Contributed by Steve Loughran.
    
    This strips out all the -p preservation options which have already been
    processed when uploading a file before deciding whether or not to query
    the far end for the status of the (existing/uploaded) file to see if any
    other attributes need changing.
    
    This will avoid 404 caching-related issues in S3, wherein a newly created
    file can have a 404 entry in the S3 load balancer's cache from the
    probes for the file's existence prior to the upload.
    
    It partially addresses a regression caused by HADOOP-8143,
    "Change distcp to have -pb on by default" that causes a resurfacing
    of HADOOP-13145, "In DistCp, prevent unnecessary getFileStatus call when
    not preserving metadata"
    
    Change-Id: Ibc25d19e92548e6165eb8397157ebf89446333f7
---
 .../org/apache/hadoop/tools/util/DistCpUtils.java  |   3 +
 .../apache/hadoop/tools/util/TestDistCpUtils.java  | 131 +++++++++++++++------
 2 files changed, 97 insertions(+), 37 deletions(-)

diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
index 73c49bb..9c1666b 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
@@ -199,6 +199,9 @@ public class DistCpUtils {
                               EnumSet<FileAttribute> attributes,
                               boolean preserveRawXattrs) throws IOException {
 
+    // strip out those attributes we don't need any more
+    attributes.remove(FileAttribute.BLOCKSIZE);
+    attributes.remove(FileAttribute.CHECKSUMTYPE);
     // If not preserving anything from FileStatus, don't bother fetching it.
     FileStatus targetFileStatus = attributes.isEmpty() ? null :
         targetFS.getFileStatus(path);
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java
index 4acb022..d76e227 100644
--- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java
@@ -45,6 +45,7 @@ import org.junit.Test;
 
 import com.google.common.collect.Lists;
 
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.util.EnumSet;
@@ -66,6 +67,7 @@ import static org.apache.hadoop.hdfs.server.namenode.AclTestHelpers.aclEntry;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 import static org.assertj.core.api.Assertions.assertThat;
 
@@ -191,15 +193,95 @@ public class TestDistCpUtils {
 
     DistCpUtils.preserve(fs, dst, srcStatus, attributes, false);
 
-    CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
+    assertStatusEqual(fs, dst, srcStatus);
+  }
+
+  private void assertStatusEqual(final FileSystem fs,
+      final Path dst,
+      final CopyListingFileStatus srcStatus) throws IOException {
+    FileStatus destStatus = fs.getFileStatus(dst);
+    CopyListingFileStatus dstStatus = new CopyListingFileStatus(
+        destStatus);
+
+    String text = String.format("Source %s; dest %s: wrong ", srcStatus,
+        destStatus);
 
     // FileStatus.equals only compares path field, must explicitly compare all fields
-    Assert.assertTrue(srcStatus.getPermission().equals(dstStatus.getPermission()));
-    Assert.assertTrue(srcStatus.getOwner().equals(dstStatus.getOwner()));
-    Assert.assertTrue(srcStatus.getGroup().equals(dstStatus.getGroup()));
-    Assert.assertTrue(srcStatus.getAccessTime() == dstStatus.getAccessTime());
-    Assert.assertTrue(srcStatus.getModificationTime() == dstStatus.getModificationTime());
-    Assert.assertTrue(srcStatus.getReplication() == dstStatus.getReplication());
+    assertEquals(text + "permission",
+        srcStatus.getPermission(), dstStatus.getPermission());
+    assertEquals(text + "owner",
+        srcStatus.getOwner(), dstStatus.getOwner());
+    assertEquals(text + "group",
+        srcStatus.getGroup(), dstStatus.getGroup());
+    assertEquals(text + "accessTime",
+        srcStatus.getAccessTime(), dstStatus.getAccessTime());
+    assertEquals(text + "modificationTime",
+        srcStatus.getModificationTime(), dstStatus.getModificationTime());
+    assertEquals(text + "replication",
+        srcStatus.getReplication(), dstStatus.getReplication());
+  }
+
+  private void assertStatusNotEqual(final FileSystem fs,
+      final Path dst,
+      final CopyListingFileStatus srcStatus) throws IOException {
+    FileStatus destStatus = fs.getFileStatus(dst);
+    CopyListingFileStatus dstStatus = new CopyListingFileStatus(
+        destStatus);
+
+    String text = String.format("Source %s; dest %s: wrong ",
+        srcStatus, destStatus);
+    // FileStatus.equals only compares path field,
+    // must explicitly compare all fields
+    assertNotEquals(text + "permission",
+        srcStatus.getPermission(), dstStatus.getPermission());
+    assertNotEquals(text + "owner",
+        srcStatus.getOwner(), dstStatus.getOwner());
+    assertNotEquals(text + "group",
+        srcStatus.getGroup(), dstStatus.getGroup());
+    assertNotEquals(text + "accessTime",
+        srcStatus.getAccessTime(), dstStatus.getAccessTime());
+    assertNotEquals(text + "modificationTime",
+        srcStatus.getModificationTime(), dstStatus.getModificationTime());
+    assertNotEquals(text + "replication",
+        srcStatus.getReplication(), dstStatus.getReplication());
+  }
+
+
+  @Test
+  public void testSkipsNeedlessAttributes() throws Exception {
+    FileSystem fs = FileSystem.get(config);
+
+    // preserve replication, block size, user, group, permission,
+    // checksum type and timestamps
+
+    Path src = new Path("/tmp/testSkipsNeedlessAttributes/source");
+    Path dst = new Path("/tmp/testSkipsNeedlessAttributes/dest");
+
+    // there is no need to actually create a source file, just a file
+    // status of one
+    CopyListingFileStatus srcStatus = new CopyListingFileStatus(
+        new FileStatus(0, false, 1, 32, 0, src));
+
+    // if an attribute is needed, preserve will fail to find the file
+    EnumSet<FileAttribute> attrs = EnumSet.of(FileAttribute.ACL,
+        FileAttribute.GROUP,
+        FileAttribute.PERMISSION,
+        FileAttribute.TIMES,
+        FileAttribute.XATTR);
+    for (FileAttribute attr : attrs) {
+      intercept(FileNotFoundException.class, () ->
+          DistCpUtils.preserve(fs, dst, srcStatus,
+              EnumSet.of(attr),
+              false));
+    }
+
+    // but with the preservation flags only used
+    // in file creation, this does not happen
+    DistCpUtils.preserve(fs, dst, srcStatus,
+        EnumSet.of(
+            FileAttribute.BLOCKSIZE,
+            FileAttribute.CHECKSUMTYPE),
+        false);
   }
 
   @Test
@@ -258,16 +340,8 @@ public class TestDistCpUtils {
 
     // FileStatus.equals only compares path field, must explicitly compare all
     // fields
-    Assert.assertEquals("getPermission", srcStatus.getPermission(),
-        dstStatus.getPermission());
-    Assert.assertEquals("Owner", srcStatus.getOwner(), dstStatus.getOwner());
-    Assert.assertEquals("Group", srcStatus.getGroup(), dstStatus.getGroup());
-    Assert.assertEquals("AccessTime", srcStatus.getAccessTime(),
-        dstStatus.getAccessTime());
-    Assert.assertEquals("ModificationTime", srcStatus.getModificationTime(),
-        dstStatus.getModificationTime());
-    Assert.assertEquals("Replication", srcStatus.getReplication(),
-        dstStatus.getReplication());
+    assertStatusEqual(fs, dest, srcStatus);
+
     Assert.assertArrayEquals(en1.toArray(), dd2.toArray());
   }
 
@@ -486,12 +560,7 @@ public class TestDistCpUtils {
     CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
 
     // FileStatus.equals only compares path field, must explicitly compare all fields
-    Assert.assertFalse(srcStatus.getPermission().equals(dstStatus.getPermission()));
-    Assert.assertFalse(srcStatus.getOwner().equals(dstStatus.getOwner()));
-    Assert.assertFalse(srcStatus.getGroup().equals(dstStatus.getGroup()));
-    Assert.assertFalse(srcStatus.getAccessTime() == dstStatus.getAccessTime());
-    Assert.assertFalse(srcStatus.getModificationTime() == dstStatus.getModificationTime());
-    Assert.assertFalse(srcStatus.getReplication() == dstStatus.getReplication());
+    assertStatusNotEqual(fs, dst, srcStatus);
   }
 
   @Test
@@ -842,13 +911,7 @@ public class TestDistCpUtils {
 
     // FileStatus.equals only compares path field, must explicitly compare all fields
     // attributes of src -> f2 ? should be yes
-    CopyListingFileStatus f2Status = new CopyListingFileStatus(fs.getFileStatus(f2));
-    Assert.assertTrue(srcStatus.getPermission().equals(f2Status.getPermission()));
-    Assert.assertTrue(srcStatus.getOwner().equals(f2Status.getOwner()));
-    Assert.assertTrue(srcStatus.getGroup().equals(f2Status.getGroup()));
-    Assert.assertTrue(srcStatus.getAccessTime() == f2Status.getAccessTime());
-    Assert.assertTrue(srcStatus.getModificationTime() == f2Status.getModificationTime());
-    Assert.assertTrue(srcStatus.getReplication() == f2Status.getReplication());
+    assertStatusEqual(fs, f2, srcStatus);
 
     // attributes of src -> f1 ? should be no
     CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));
@@ -1047,13 +1110,7 @@ public class TestDistCpUtils {
 
     // FileStatus.equals only compares path field, must explicitly compare all fields
     // attributes of src -> f0 ? should be yes
-    CopyListingFileStatus f0Status = new CopyListingFileStatus(fs.getFileStatus(f0));
-    Assert.assertTrue(srcStatus.getPermission().equals(f0Status.getPermission()));
-    Assert.assertTrue(srcStatus.getOwner().equals(f0Status.getOwner()));
-    Assert.assertTrue(srcStatus.getGroup().equals(f0Status.getGroup()));
-    Assert.assertTrue(srcStatus.getAccessTime() == f0Status.getAccessTime());
-    Assert.assertTrue(srcStatus.getModificationTime() == f0Status.getModificationTime());
-    Assert.assertTrue(srcStatus.getReplication() == f0Status.getReplication());
+    assertStatusEqual(fs, f0, srcStatus);
 
     // attributes of src -> f1 ? should be no
     CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org