You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by st...@apache.org on 2019/10/14 16:07:54 UTC

[hadoop] branch trunk updated: HADOOP-16635. S3A "directories only" scan still does a HEAD.

This is an automated email from the ASF dual-hosted git repository.

stevel pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 74e5018  HADOOP-16635. S3A "directories only" scan still does a HEAD.
74e5018 is described below

commit 74e5018d871bdf712b3ad0706150a37cb8efee5c
Author: Steve Loughran <st...@apache.org>
AuthorDate: Mon Oct 14 17:05:52 2019 +0100

    HADOOP-16635. S3A "directories only" scan still does a HEAD.
    
    Contributed by Steve Loughran.
    
    Change-Id: I5e41d7f721364c392e1f4344db83dfa8c5aa06ce
---
 .../org/apache/hadoop/fs/s3a/S3AFileSystem.java    |  91 ++++++++++----
 .../apache/hadoop/fs/s3a/impl/StatusProbeEnum.java |  16 +++
 .../hadoop/fs/s3a/ITestAuthoritativePath.java      |   8 +-
 .../hadoop/fs/s3a/ITestS3AFileOperationCost.java   | 140 ++++++++++++++++++++-
 .../org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java  |   6 +
 5 files changed, 229 insertions(+), 32 deletions(-)

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index 26f16a7..ff2ba14 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -2610,7 +2610,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
    * @param f The path we want information from
    * @param needEmptyDirectoryFlag if true, implementation will calculate
    *        a TRUE or FALSE value for {@link S3AFileStatus#isEmptyDirectory()}
-   * @param probes probes to make
+   * @param probes probes to make.
    * @return a S3AFileStatus object
    * @throws FileNotFoundException when the path does not exist
    * @throws IOException on other problems.
@@ -2711,7 +2711,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
       // there was no entry in S3Guard
       // retrieve the data and update the metadata store in the process.
       return S3Guard.putAndReturn(metadataStore,
-          s3GetFileStatus(path, key, StatusProbeEnum.ALL, tombstones),
+          s3GetFileStatus(path, key, probes, tombstones),
           instrumentation,
           ttlTimeProvider);
     }
@@ -2719,31 +2719,70 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
 
   /**
    * Raw {@code getFileStatus} that talks direct to S3.
-   * Used to implement {@link #innerGetFileStatus(Path, boolean)},
+   * Used to implement {@link #innerGetFileStatus(Path, boolean, Set)},
    * and for direct management of empty directory blobs.
+   *
+   * Checks made, in order:
+   * <ol>
+   *   <li>
+   *     Head: look for an object at the given key, provided that
+   *     the key doesn't end in "/"
+   *   </li>
+   *   <li>
+   *     DirMarker: look for the directory marker -the key with a trailing /
+   *     if not passed in.
+   *     If an object was found with size 0 bytes, a directory status entry
+   *     is returned which declares that the directory is empty.
+   *   </li>
+   *    <li>
+   *     List: issue a LIST on the key (with / if needed), require one
+   *     entry to be found for the path to be considered a non-empty directory.
+   *   </li>
+   * </ol>
+   *
+   * Notes:
+   * <ul>
+   *   <li>
+   *     Objects ending in / which are not 0-bytes long are not treated as
+   *     directory markers, but instead as files.
+   *   </li>
+   *   <li>
+   *     There's ongoing discussions about whether a dir marker
+   *     should be interpreted as an empty dir.
+   *   </li>
+   *   <li>
+   *     The HEAD requests require the permissions to read an object,
+   *     including (we believe) the ability to decrypt the file.
+   *     At the very least, for SSE-C markers, you need the same key on
+   *     the client for the HEAD to work.
+   *   </li>
+   *   <li>
+   *     The List probe needs list permission; it is also more prone to
+   *     inconsistency, even on newly created files.
+   *   </li>
+   * </ul>
+   *
    * Retry policy: retry translated.
    * @param path Qualified path
    * @param key  Key string for the path
    * @param probes probes to make
    * @param tombstones tombstones to filter
    * @return Status
-   * @throws FileNotFoundException when the path does not exist
+   * @throws FileNotFoundException the supplied probes failed.
    * @throws IOException on other problems.
    */
+  @VisibleForTesting
   @Retries.RetryTranslated
-  private S3AFileStatus s3GetFileStatus(final Path path,
-      String key,
+  S3AFileStatus s3GetFileStatus(final Path path,
+      final String key,
       final Set<StatusProbeEnum> probes,
       final Set<Path> tombstones) throws IOException {
-    if (!key.isEmpty() && probes.contains(StatusProbeEnum.Head)) {
-      try {
-        ObjectMetadata meta = getObjectMetadata(key);
-
-        if (objectRepresentsDirectory(key, meta.getContentLength())) {
-          LOG.debug("Found exact file: fake directory");
-          return new S3AFileStatus(Tristate.TRUE, path, username);
-        } else {
-          LOG.debug("Found exact file: normal file");
+    if (!key.isEmpty()) {
+      if (probes.contains(StatusProbeEnum.Head) && !key.endsWith("/")) {
+        try {
+          // look for the simple file
+          ObjectMetadata meta = getObjectMetadata(key);
+          LOG.debug("Found exact file: normal file {}", key);
           return new S3AFileStatus(meta.getContentLength(),
               dateToLong(meta.getLastModified()),
               path,
@@ -2751,18 +2790,22 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
               username,
               meta.getETag(),
               meta.getVersionId());
-        }
-      } catch (AmazonServiceException e) {
-        if (e.getStatusCode() != SC_404) {
+        } catch (AmazonServiceException e) {
+          // if the response is a 404 error, it just means that there is
+          // no file at that path...the remaining checks will be needed.
+          if (e.getStatusCode() != SC_404) {
+            throw translateException("getFileStatus", path, e);
+          }
+        } catch (AmazonClientException e) {
           throw translateException("getFileStatus", path, e);
         }
-      } catch (AmazonClientException e) {
-        throw translateException("getFileStatus", path, e);
       }
 
+      // Either a normal file was not found or the probe was skipped.
+      // because the key ended in "/" or it was not in the set of probes.
       // Look for the dir marker
-      if (!key.endsWith("/") && probes.contains(StatusProbeEnum.DirMarker)) {
-        String newKey = key + "/";
+      if (probes.contains(StatusProbeEnum.DirMarker)) {
+        String newKey = maybeAddTrailingSlash(key);
         try {
           ObjectMetadata meta = getObjectMetadata(newKey);
 
@@ -2794,8 +2837,8 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
     // execute the list
     if (probes.contains(StatusProbeEnum.List)) {
       try {
-        key = maybeAddTrailingSlash(key);
-        S3ListRequest request = createListObjectsRequest(key, "/", 1);
+        String dirKey = maybeAddTrailingSlash(key);
+        S3ListRequest request = createListObjectsRequest(dirKey, "/", 1);
 
         S3ListResult objects = listObjects(request);
 
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/StatusProbeEnum.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/StatusProbeEnum.java
index ca2875c..f843b20 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/StatusProbeEnum.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/StatusProbeEnum.java
@@ -41,4 +41,20 @@ public enum StatusProbeEnum {
   public static final Set<StatusProbeEnum> DIRECTORIES =
       EnumSet.of(DirMarker, List);
 
+  /** We only want the HEAD or dir marker. */
+  public static final Set<StatusProbeEnum> HEAD_OR_DIR_MARKER =
+      EnumSet.of(Head, DirMarker);
+
+  /** We only want the HEAD. */
+  public static final Set<StatusProbeEnum> HEAD_ONLY =
+      EnumSet.of(Head);
+
+  /** We only want the dir marker. */
+  public static final Set<StatusProbeEnum> DIR_MARKER_ONLY =
+      EnumSet.of(DirMarker);
+
+  /** We only want the dir marker. */
+  public static final Set<StatusProbeEnum> LIST_ONLY =
+      EnumSet.of(List);
+
 }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestAuthoritativePath.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestAuthoritativePath.java
index c35a585..eb54c0e 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestAuthoritativePath.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestAuthoritativePath.java
@@ -84,14 +84,18 @@ public class ITestAuthoritativePath extends AbstractS3ATestBase {
 
   private void cleanUpFS(S3AFileSystem fs) {
     // detach from the (shared) metadata store.
-    fs.setMetadataStore(new NullMetadataStore());
+    if (fs != null) {
+      fs.setMetadataStore(new NullMetadataStore());
+    }
 
     IOUtils.cleanupWithLogger(LOG, fs);
   }
 
   @Override
   public void teardown() throws Exception {
-    fullyAuthFS.delete(testRoot, true);
+    if (fullyAuthFS != null) {
+      fullyAuthFS.delete(testRoot, true);
+    }
 
     cleanUpFS(fullyAuthFS);
     cleanUpFS(rawFS);
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java
index c62176b..e2f7fea 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java
@@ -18,23 +18,32 @@
 
 package org.apache.hadoop.fs.s3a;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.s3a.impl.StatusProbeEnum;
 
+import org.assertj.core.api.Assertions;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.net.URI;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
 import java.util.UUID;
 import java.util.concurrent.Callable;
 
 import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
 import static org.apache.hadoop.fs.s3a.Statistic.*;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.*;
 import static org.apache.hadoop.test.GenericTestUtils.getTestDir;
@@ -43,7 +52,9 @@ import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 /**
  * Use metrics to assert about the cost of file status queries.
  * {@link S3AFileSystem#getFileStatus(Path)}.
+ * Parameterized on guarded vs raw.
  */
+@RunWith(Parameterized.class)
 public class ITestS3AFileOperationCost extends AbstractS3ATestBase {
 
   private MetricDiff metadataRequests;
@@ -52,9 +63,48 @@ public class ITestS3AFileOperationCost extends AbstractS3ATestBase {
   private static final Logger LOG =
       LoggerFactory.getLogger(ITestS3AFileOperationCost.class);
 
+  /**
+   * Parameterization.
+   */
+  @Parameterized.Parameters(name = "{0}")
+  public static Collection<Object[]> params() {
+    return Arrays.asList(new Object[][]{
+        {"raw", false},
+        {"guarded", true}
+    });
+  }
+
+  private final String name;
+
+  private final boolean s3guard;
+
+  public ITestS3AFileOperationCost(final String name, final boolean s3guard) {
+    this.name = name;
+    this.s3guard = s3guard;
+  }
+
+  @Override
+  public Configuration createConfiguration() {
+    Configuration conf = super.createConfiguration();
+    String bucketName = getTestBucketName(conf);
+    removeBucketOverrides(bucketName, conf,
+        S3_METADATA_STORE_IMPL);
+    if (!s3guard) {
+      // in a raw run remove all s3guard settings
+      removeBaseAndBucketOverrides(bucketName, conf,
+          S3_METADATA_STORE_IMPL);
+    }
+    disableFilesystemCaching(conf);
+    return conf;
+  }
   @Override
   public void setup() throws Exception {
     super.setup();
+    if (s3guard) {
+      // s3guard is required for those test runs where any of the
+      // guard options are set
+      assumeS3GuardState(true, getConfiguration());
+    }
     S3AFileSystem fs = getFileSystem();
     metadataRequests = new MetricDiff(fs, OBJECT_METADATA_REQUESTS);
     listRequests = new MetricDiff(fs, OBJECT_LIST_REQUESTS);
@@ -80,6 +130,19 @@ public class ITestS3AFileOperationCost extends AbstractS3ATestBase {
     reset(metadataRequests, listRequests);
   }
 
+  /**
+   * Verify that the head and list calls match expectations,
+   * then reset the counters ready for the next operation.
+   * @param head expected HEAD count
+   * @param list expected LIST count
+   */
+  private void verifyOperationCount(int head, int list) {
+    metadataRequests.assertDiffEquals(head);
+    listRequests.assertDiffEquals(list);
+    metadataRequests.reset();
+    listRequests.reset();
+  }
+
   @Test
   public void testCostOfGetFileStatusOnEmptyDir() throws Throwable {
     describe("performing getFileStatus on an empty directory");
@@ -205,12 +268,6 @@ public class ITestS3AFileOperationCost extends AbstractS3ATestBase {
         + "clean up activity");
     S3AFileSystem fs = getFileSystem();
 
-    // As this test uses the s3 metrics to count the number of fake directory
-    // operations, it depends on side effects happening internally. With
-    // metadata store enabled, it is brittle to change. We disable this test
-    // before the internal behavior w/ or w/o metadata store.
-//    assumeFalse(fs.hasMetadataStore());
-
     Path srcBaseDir = path("src");
     mkdirs(srcBaseDir);
     MetricDiff deleteRequests =
@@ -389,4 +446,75 @@ public class ITestS3AFileOperationCost extends AbstractS3ATestBase {
       fs.delete(dest, false);
     }
   }
+
+  @Test
+  public void testDirProbes() throws Throwable {
+    describe("Test directory probe cost -raw only");
+    S3AFileSystem fs = getFileSystem();
+    assume("Unguarded FS only", !fs.hasMetadataStore());
+    String dir = "testEmptyDirHeadProbe";
+    Path emptydir = path(dir);
+    // Create the empty directory.
+    fs.mkdirs(emptydir);
+
+    // metrics and assertions.
+    resetMetricDiffs();
+
+    intercept(FileNotFoundException.class, () ->
+        fs.innerGetFileStatus(emptydir, false,
+            StatusProbeEnum.HEAD_ONLY));
+    verifyOperationCount(1, 0);
+
+    // a LIST will find it -but it doesn't consider it an empty dir.
+    S3AFileStatus status = fs.innerGetFileStatus(emptydir, true,
+        StatusProbeEnum.LIST_ONLY);
+    verifyOperationCount(0, 1);
+    Assertions.assertThat(status)
+        .describedAs("LIST output is not considered empty")
+        .matches(s -> !s.isEmptyDirectory().equals(Tristate.TRUE),  "is empty");
+
+    // finally, skip all probes and expect no operations toThere are
+    // take place
+    intercept(FileNotFoundException.class, () ->
+        fs.innerGetFileStatus(emptydir, false,
+            EnumSet.noneOf(StatusProbeEnum.class)));
+    verifyOperationCount(0, 0);
+
+    // now add a trailing slash to the key and use the
+    // deep internal s3GetFileStatus method call.
+    String emptyDirTrailingSlash = fs.pathToKey(emptydir.getParent())
+        + "/" + dir +  "/";
+    // A HEAD request does not probe for keys with a trailing /
+    intercept(FileNotFoundException.class, () ->
+        fs.s3GetFileStatus(emptydir, emptyDirTrailingSlash,
+            StatusProbeEnum.HEAD_ONLY, null));
+    verifyOperationCount(0, 0);
+
+    // but ask for a directory marker and you get the entry
+    status = fs.s3GetFileStatus(emptydir,
+        emptyDirTrailingSlash,
+        StatusProbeEnum.DIR_MARKER_ONLY, null);
+    verifyOperationCount(1, 0);
+    assertEquals(emptydir, status.getPath());
+  }
+
+  @Test
+  public void testCreateCost() throws Throwable {
+    describe("Test file creation cost -raw only");
+    S3AFileSystem fs = getFileSystem();
+    assume("Unguarded FS only", !fs.hasMetadataStore());
+    resetMetricDiffs();
+    Path testFile = path("testCreateCost");
+
+    // when overwrite is false, the path is checked for existence.
+    try (FSDataOutputStream out = fs.create(testFile, false)) {
+      verifyOperationCount(2, 1);
+    }
+
+    // but when true: only the directory checks take place.
+    try (FSDataOutputStream out = fs.create(testFile, true)) {
+      verifyOperationCount(1, 1);
+    }
+
+  }
 }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java
index fb539a8..b068f3d 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java
@@ -96,6 +96,12 @@ public class ITestS3GuardTtl extends AbstractS3ATestBase {
     return configuration;
   }
 
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    Assume.assumeTrue(getFileSystem().hasMetadataStore());
+  }
+
   @Test
   public void testDirectoryListingAuthoritativeTtl() throws Exception {
     LOG.info("Authoritative mode: {}", authoritative);


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org