You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by st...@apache.org on 2022/03/22 13:45:56 UTC
[hadoop] branch branch-3.3 updated: HADOOP-13704. Optimized S3A getContentSummary()
This is an automated email from the ASF dual-hosted git repository.
stevel pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new 105e0db HADOOP-13704. Optimized S3A getContentSummary()
105e0db is described below
commit 105e0dbd92406cc6293823f664c359219130905f
Author: Steve Loughran <st...@cloudera.com>
AuthorDate: Tue Mar 22 13:20:37 2022 +0000
HADOOP-13704. Optimized S3A getContentSummary()
Optimize the scan for s3 by performing a deep tree listing,
inferring directory counts from the paths returned.
Contributed by Ahmar Suhail.
Change-Id: I26ffa8c6f65fd11c68a88d6e2243b0eac6ffd024
---
.../src/site/markdown/filesystem/filesystem.md | 20 ++++
.../AbstractContractContentSummaryTest.java | 65 +++++++++++++
.../localfs/TestLocalFSContractContentSummary.java | 31 ++++++
.../org/apache/hadoop/fs/s3a/S3AFileSystem.java | 6 +-
.../apache/hadoop/fs/s3a/S3ObjectAttributes.java | 2 +-
.../fs/s3a/impl/GetContentSummaryOperation.java | 104 +++++++++++++--------
.../s3a/ITestS3AContractContentSummary.java | 70 ++++++++++++++
.../s3a/performance/ITestS3AMiscOperationCost.java | 4 +-
.../fs/s3a/scale/ITestS3ADirectoryPerformance.java | 33 +++++++
9 files changed, 289 insertions(+), 46 deletions(-)
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
index 0e01aa1..4517bd8 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
@@ -453,6 +453,26 @@ The function `getLocatedFileStatus(FS, d)` is as defined in
The atomicity and consistency constraints are as for
`listStatus(Path, PathFilter)`.
+
+### `ContentSummary getContentSummary(Path path)`
+
+Given a path return its content summary.
+
+`getContentSummary()` first checks whether the given path is a file; if so, it returns 0 for the directory count
+and 1 for the file count.
+
+#### Preconditions
+
+ exists(FS, path) else raise FileNotFoundException
+
+#### Postconditions
+
+Returns a `ContentSummary` object with information such as directory count
+and file count for a given path.
+
+The atomicity and consistency constraints are as for
+`listStatus(Path, PathFilter)`.
+
### `BlockLocation[] getFileBlockLocations(FileStatus f, int s, int l)`
#### Preconditions
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java
new file mode 100644
index 0000000..5e5c917
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.contract;
+
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import java.io.FileNotFoundException;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+public abstract class AbstractContractContentSummaryTest extends AbstractFSContractTestBase {
+
+ @Test
+ public void testGetContentSummary() throws Throwable {
+ FileSystem fs = getFileSystem();
+
+ Path parent = path("parent");
+ Path nested = path(parent + "/a/b/c");
+ Path filePath = path(nested + "file.txt");
+
+ fs.mkdirs(parent);
+ fs.mkdirs(nested);
+ touch(getFileSystem(), filePath);
+
+ ContentSummary summary = fs.getContentSummary(parent);
+
+ Assertions.assertThat(summary.getDirectoryCount()).as("Summary " + summary).isEqualTo(4);
+
+ Assertions.assertThat(summary.getFileCount()).as("Summary " + summary).isEqualTo(1);
+ }
+
+ @Test
+ public void testGetContentSummaryIncorrectPath() throws Throwable {
+ FileSystem fs = getFileSystem();
+
+ Path parent = path("parent");
+ Path nested = path(parent + "/a");
+
+ fs.mkdirs(parent);
+
+ intercept(FileNotFoundException.class, () -> fs.getContentSummary(nested));
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java
new file mode 100644
index 0000000..7555cf8
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.contract.localfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.contract.AbstractContractContentSummaryTest;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+
+public class TestLocalFSContractContentSummary extends AbstractContractContentSummaryTest {
+
+ @Override
+ protected AbstractFSContract createContract(Configuration conf) {
+ return new LocalFSContract(conf);
+ }
+}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index a977fb3..565dc82 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -3261,9 +3261,9 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
}
@Override
- public RemoteIterator<S3AFileStatus> listStatusIterator(final Path path)
- throws IOException {
- return S3AFileSystem.this.innerListStatus(path);
+ public RemoteIterator<S3ALocatedFileStatus> listFilesIterator(final Path path,
+ final boolean recursive) throws IOException {
+ return S3AFileSystem.this.innerListFiles(path, recursive, Listing.ACCEPT_ALL_BUT_S3N, null);
}
}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java
index 5a8dfc7..275b207 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java
@@ -66,7 +66,7 @@ public class S3ObjectAttributes {
/**
* Construct from the result of a copy and those parameters
* which aren't included in an AWS SDK response.
- * @param path
+ * @param path path
* @param copyResult copy result.
* @param serverSideEncryptionAlgorithm current encryption algorithm
* @param serverSideEncryptionKey any server side encryption key?
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java
index 23631c6..248bffb 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.fs.s3a.impl;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.util.HashSet;
import java.util.Set;
import org.slf4j.Logger;
@@ -34,22 +35,15 @@ import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.statistics.IOStatistics;
import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.s3a.S3ALocatedFileStatus;
import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
/**
* GetContentSummary operation.
- * This is based on {@code FileSystem.get#getContentSummary};
- * its still doing sequential treewalk with the efficiency
- * issues.
*
- * Changes:
- * 1. On the recursive calls there
- * is no probe to see if the path is a file: we know the
- * recursion only happens with a dir.
- * 2. If a subdirectory is not found during the walk, that
- * does not trigger an error. The directory is clearly
- * not part of the content any more.
+ * It is optimized for s3 and performs a deep tree listing,
+ * inferring directory counts from the paths returned.
*
* The Operation serves up IOStatistics; this counts
* the cost of all the list operations, but not the
@@ -122,9 +116,7 @@ public class GetContentSummaryOperation extends
/**
* Return the {@link ContentSummary} of a given directory.
- * This is a recursive operation (as the original is);
- * it'd be more efficient of stack and heap if it managed its
- * own stack.
+ *
* @param dir dir to scan
* @throws FileNotFoundException if the path does not resolve
* @throws IOException IO failure
@@ -133,34 +125,65 @@ public class GetContentSummaryOperation extends
* @throws IOException failure
*/
public ContentSummary getDirSummary(Path dir) throws IOException {
+
long totalLength = 0;
long fileCount = 0;
long dirCount = 1;
- final RemoteIterator<S3AFileStatus> it
- = callbacks.listStatusIterator(dir);
+
+ RemoteIterator<S3ALocatedFileStatus> it = callbacks.listFilesIterator(dir, true);
+
+ Set<Path> dirSet = new HashSet<>();
+ Set<Path> pathsTraversed = new HashSet<>();
while (it.hasNext()) {
- final S3AFileStatus s = it.next();
- if (s.isDirectory()) {
- try {
- ContentSummary c = getDirSummary(s.getPath());
- totalLength += c.getLength();
- fileCount += c.getFileCount();
- dirCount += c.getDirectoryCount();
- } catch (FileNotFoundException ignored) {
- // path was deleted during the scan; exclude from
- // summary.
- }
- } else {
- totalLength += s.getLen();
+ S3ALocatedFileStatus fileStatus = it.next();
+ Path filePath = fileStatus.getPath();
+
+ if (fileStatus.isDirectory() && !filePath.equals(dir)) {
+ dirSet.add(filePath);
+ buildDirectorySet(dirSet, pathsTraversed, dir, filePath.getParent());
+ } else if (!fileStatus.isDirectory()) {
fileCount += 1;
+ totalLength += fileStatus.getLen();
+ buildDirectorySet(dirSet, pathsTraversed, dir, filePath.getParent());
}
+
}
+
// Add the list's IOStatistics
iostatistics.aggregate(retrieveIOStatistics(it));
+
return new ContentSummary.Builder().length(totalLength).
- fileCount(fileCount).directoryCount(dirCount).
- spaceConsumed(totalLength).build();
+ fileCount(fileCount).directoryCount(dirCount + dirSet.size()).
+ spaceConsumed(totalLength).build();
+ }
+
+ /**
+ * This method builds the set of all directories found under the base path. We need to do this
+ * because if the directory structure /a/b/c was created with a single mkdirs() call, it is
+ * stored as 1 object in S3 and the list files iterator will only return a single entry /a/b/c.
+ *
+ * We keep track of paths traversed so far to prevent duplication of work. For example, if we had
+ * /a/b/c/file-1.txt and /a/b/c/file-2.txt, we will only recurse over the complete path once
+ * and won't have to do anything for file-2.txt.
+ *
+ * @param dirSet Set of all directories found in the path
+ * @param pathsTraversed Set of all paths traversed so far
+ * @param basePath Path of directory to scan
+ * @param parentPath Parent path of the current file/directory in the iterator
+ */
+ private void buildDirectorySet(Set<Path> dirSet, Set<Path> pathsTraversed, Path basePath,
+ Path parentPath) {
+
+ if (parentPath == null || pathsTraversed.contains(parentPath) || parentPath.equals(basePath)) {
+ return;
+ }
+
+ dirSet.add(parentPath);
+
+ buildDirectorySet(dirSet, pathsTraversed, basePath, parentPath.getParent());
+
+ pathsTraversed.add(parentPath);
}
/**
@@ -186,23 +209,24 @@ public class GetContentSummaryOperation extends
/**
* Get the status of a path.
- * @param path path to probe.
+ *
+ * @param path path to probe.
* @param probes probes to exec
* @return the status
* @throws IOException failure
*/
@Retries.RetryTranslated
- S3AFileStatus probePathStatus(Path path,
- Set<StatusProbeEnum> probes) throws IOException;
-
- /**
- * Incremental list of all entries in a directory.
- * @param path path of dir
- * @return an iterator
+ S3AFileStatus probePathStatus(Path path, Set<StatusProbeEnum> probes) throws IOException;
+
+ /**
+ * List all entries under a path.
+ *
+ * @param path path of the directory to list
+ * @param recursive if the subdirectories need to be traversed recursively
+ * @return an iterator over the listing.
* @throws IOException failure
*/
- RemoteIterator<S3AFileStatus> listStatusIterator(Path path)
+ RemoteIterator<S3ALocatedFileStatus> listFilesIterator(Path path, boolean recursive)
throws IOException;
-
}
}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java
new file mode 100644
index 0000000..ad83cfe
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.contract.s3a;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.contract.AbstractContractContentSummaryTest;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+
+public class ITestS3AContractContentSummary extends AbstractContractContentSummaryTest {
+
+ @Test
+ public void testGetContentSummaryDir() throws Throwable {
+ describe("getContentSummary on test dir with children");
+ S3AFileSystem fs = getFileSystem();
+ Path baseDir = methodPath();
+
+ // Nested folders created separately will return as separate objects in listFiles()
+ fs.mkdirs(new Path(baseDir, "a"));
+ fs.mkdirs(new Path(baseDir, "a/b"));
+ fs.mkdirs(new Path(baseDir, "a/b/a"));
+
+ // Will return as one object
+ fs.mkdirs(new Path(baseDir, "d/e/f"));
+
+ Path filePath = new Path(baseDir, "a/b/file");
+ touch(fs, filePath);
+
+ // look at path to see if it is a file
+ // it is not: so LIST
+ final ContentSummary summary = fs.getContentSummary(baseDir);
+
+ Assertions.assertThat(summary.getDirectoryCount()).as("Summary " + summary).isEqualTo(7);
+ Assertions.assertThat(summary.getFileCount()).as("Summary " + summary).isEqualTo(1);
+ }
+
+ @Override
+ protected AbstractFSContract createContract(Configuration conf) {
+ return new S3AContract(conf);
+ }
+
+ @Override
+ public S3AFileSystem getFileSystem() {
+ return (S3AFileSystem) super.getFileSystem();
+ }
+
+}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java
index 75701b5..6a6baa0 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java
@@ -144,8 +144,8 @@ public class ITestS3AMiscOperationCost extends AbstractS3ACostTest {
with(INVOCATION_GET_CONTENT_SUMMARY, 1),
withAuditCount(1),
always(FILE_STATUS_FILE_PROBE // look at path to see if it is a file
- .plus(LIST_OPERATION) // it is not: so LIST
- .plus(LIST_OPERATION))); // and a LIST on the child dir
+ .plus(LIST_OPERATION))); // it is not: so LIST
+
Assertions.assertThat(summary.getDirectoryCount())
.as("Summary " + summary)
.isEqualTo(2);
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java
index d87af3b..946e59e 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.fs.s3a.scale;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
@@ -154,6 +155,38 @@ public class ITestS3ADirectoryPerformance extends S3AScaleTestBase {
listStatusCalls,
getFileStatusCalls);
+ describe("Get content summary for directory");
+
+ NanoTimer getContentSummaryTimer = new NanoTimer();
+
+ ContentSummary rootPathSummary = fs.getContentSummary(scaleTestDir);
+ ContentSummary testPathSummary = fs.getContentSummary(listDir);
+
+ getContentSummaryTimer.end("getContentSummary of %s", created);
+
+ // only two list operations should have taken place
+ print(LOG,
+ metadataRequests,
+ listRequests,
+ listContinueRequests,
+ listStatusCalls,
+ getFileStatusCalls);
+ assertEquals(listRequests.toString(), 2, listRequests.diff());
+ reset(metadataRequests,
+ listRequests,
+ listContinueRequests,
+ listStatusCalls,
+ getFileStatusCalls);
+
+ assertTrue("Root directory count should be > test path",
+ rootPathSummary.getDirectoryCount() > testPathSummary.getDirectoryCount());
+ assertTrue("Root file count should be >= to test path",
+ rootPathSummary.getFileCount() >= testPathSummary.getFileCount());
+ assertEquals("Incorrect directory count", created.getDirCount() + 1,
+ testPathSummary.getDirectoryCount());
+ assertEquals("Incorrect file count", created.getFileCount(),
+ testPathSummary.getFileCount());
+
} finally {
describe("deletion");
// deletion at the end of the run
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org