You are viewing a plain-text version of this content; the canonical link for it was included in the original HTML message.
Posted to common-commits@hadoop.apache.org by jb...@apache.org on 2020/11/13 21:24:43 UTC
[hadoop] branch branch-3.3 updated: HADOOP-17362. reduce RPC calls
doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein
This is an automated email from the ASF dual-hosted git repository.
jbrennan pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new 75ca0c0 HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein
75ca0c0 is described below
commit 75ca0c0f23ab5f91b5f4e418ca29322a2f1fad8b
Author: Ahmed Hussein <50...@users.noreply.github.com>
AuthorDate: Fri Nov 13 14:22:35 2020 -0600
HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein
(cherry picked from commit ebe1d1fbf7a7549078e5a468080513db09b6416f)
---
.../java/org/apache/hadoop/fs/HarFileSystem.java | 72 ++++++++++------------
.../org/apache/hadoop/fs/TestHarFileSystem.java | 1 -
.../apache/hadoop/fs/TestHarFileSystemBasics.java | 5 +-
3 files changed, 38 insertions(+), 40 deletions(-)
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java
index 02c0916..7e12d0a 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java
@@ -35,6 +35,7 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs;
@@ -513,41 +514,22 @@ public class HarFileSystem extends FileSystem {
if (!parentString.endsWith(Path.SEPARATOR)){
parentString += Path.SEPARATOR;
}
- Path harPath = new Path(parentString);
- int harlen = harPath.depth();
- final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
-
- for (HarStatus hstatus : metadata.archive.values()) {
- String child = hstatus.getName();
- if ((child.startsWith(parentString))) {
- Path thisPath = new Path(child);
- if (thisPath.depth() == harlen + 1) {
- statuses.add(toFileStatus(hstatus, cache));
- }
- }
+
+ for (String child: parent.children) {
+ Path p = new Path(parentString + child);
+ statuses.add(toFileStatus(metadata.archive.get(p)));
}
}
/**
* Combine the status stored in the index and the underlying status.
* @param h status stored in the index
- * @param cache caching the underlying file statuses
* @return the combined file status
* @throws IOException
*/
- private FileStatus toFileStatus(HarStatus h,
- Map<String, FileStatus> cache) throws IOException {
- FileStatus underlying = null;
- if (cache != null) {
- underlying = cache.get(h.partName);
- }
- if (underlying == null) {
- final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
- underlying = fs.getFileStatus(p);
- if (cache != null) {
- cache.put(h.partName, underlying);
- }
- }
+ private FileStatus toFileStatus(HarStatus h) throws IOException {
+ final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
+ FileStatus underlying = metadata.getPartFileStatus(p);
long modTime = 0;
int version = metadata.getVersion();
@@ -658,7 +640,7 @@ public class HarFileSystem extends FileSystem {
@Override
public FileStatus getFileStatus(Path f) throws IOException {
HarStatus hstatus = getFileHarStatus(f);
- return toFileStatus(hstatus, null);
+ return toFileStatus(hstatus);
}
private HarStatus getFileHarStatus(Path f) throws IOException {
@@ -815,7 +797,7 @@ public class HarFileSystem extends FileSystem {
if (hstatus.isDir()) {
fileStatusesInIndex(hstatus, statuses);
} else {
- statuses.add(toFileStatus(hstatus, null));
+ statuses.add(toFileStatus(hstatus));
}
return statuses.toArray(new FileStatus[statuses.size()]);
@@ -1143,7 +1125,8 @@ public class HarFileSystem extends FileSystem {
List<Store> stores = new ArrayList<Store>();
Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
- private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
+ // keys are always the internal har path.
+ private Map<Path, FileStatus> partFileStatuses = new ConcurrentHashMap<>();
public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
this.fs = fs;
@@ -1151,16 +1134,23 @@ public class HarFileSystem extends FileSystem {
this.archiveIndexPath = archiveIndexPath;
}
- public FileStatus getPartFileStatus(Path partPath) throws IOException {
+ public FileStatus getPartFileStatus(Path path) throws IOException {
+ Path partPath = getPathInHar(path);
FileStatus status;
status = partFileStatuses.get(partPath);
if (status == null) {
- status = fs.getFileStatus(partPath);
+ status = fs.getFileStatus(path);
partFileStatuses.put(partPath, status);
}
return status;
}
+ private void addPartFileStatuses(Path path) throws IOException {
+ for (FileStatus stat : fs.listStatus(path)) {
+ partFileStatuses.put(getPathInHar(stat.getPath()), stat);
+ }
+ }
+
public long getMasterIndexTimestamp() {
return masterIndexTimestamp;
}
@@ -1217,16 +1207,22 @@ public class HarFileSystem extends FileSystem {
try {
FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
archiveIndexTimestamp = archiveStat.getModificationTime();
- LineReader aLin;
+
+ // pre-populate part cache.
+ addPartFileStatuses(archiveIndexPath.getParent());
+ LineReader aLin = null;
// now start reading the real index file
+ long pos = -1;
for (Store s: stores) {
- read = 0;
- aIn.seek(s.begin);
- aLin = new LineReader(aIn, getConf());
- while (read + s.begin < s.end) {
- int tmp = aLin.readLine(line);
- read += tmp;
+ if (pos != s.begin) {
+ pos = s.begin;
+ aIn.seek(s.begin);
+ aLin = new LineReader(aIn, getConf());
+ }
+
+ while (pos < s.end) {
+ pos += aLin.readLine(line);
String lineFeed = line.toString();
String[] parsed = lineFeed.split(" ");
parsed[0] = decodeFileName(parsed[0]);
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java
index 8050ce6..711ab94 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java
@@ -41,7 +41,6 @@ import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.concurrent.CompletableFuture;
import static org.apache.hadoop.fs.Options.ChecksumOpt;
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java
index c58e731..6415df6 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java
@@ -33,7 +33,10 @@ import java.net.URI;
import java.util.HashSet;
import java.util.Set;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
/**
* This test class checks basic operations with {@link HarFileSystem} including
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org