Posted to common-commits@hadoop.apache.org by jb...@apache.org on 2020/11/13 21:24:43 UTC

[hadoop] branch branch-3.3 updated: HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein

This is an automated email from the ASF dual-hosted git repository.

jbrennan pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 75ca0c0  HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein
75ca0c0 is described below

commit 75ca0c0f23ab5f91b5f4e418ca29322a2f1fad8b
Author: Ahmed Hussein <50...@users.noreply.github.com>
AuthorDate: Fri Nov 13 14:22:35 2020 -0600

    HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein
    
    (cherry picked from commit ebe1d1fbf7a7549078e5a468080513db09b6416f)
---
 .../java/org/apache/hadoop/fs/HarFileSystem.java   | 72 ++++++++++------------
 .../org/apache/hadoop/fs/TestHarFileSystem.java    |  1 -
 .../apache/hadoop/fs/TestHarFileSystemBasics.java  |  5 +-
 3 files changed, 38 insertions(+), 40 deletions(-)
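
The gist of the change, before the diff: listing a HAR directory used to
issue getFileStatus() RPCs against the underlying filesystem for the part
files backing each entry, with a cache that lived only as long as a single
listing call. The patch instead pre-populates a persistent part-file
status cache with one listStatus() on the archive directory. A minimal
sketch of that pattern, using a hypothetical PartStatusCache wrapper
(FileSystem, FileStatus, and Path are the real Hadoop types; the actual
patch additionally keys the map by the path inside the har):

    import java.io.IOException;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    class PartStatusCache {
      private final Map<Path, FileStatus> statuses = new ConcurrentHashMap<>();
      private final FileSystem fs;

      PartStatusCache(FileSystem fs) {
        this.fs = fs;
      }

      // One listStatus() RPC fills the cache for every file in the
      // archive directory, replacing one getFileStatus() RPC per file.
      void prefetch(Path archiveDir) throws IOException {
        for (FileStatus stat : fs.listStatus(archiveDir)) {
          statuses.put(stat.getPath(), stat);
        }
      }

      // A hit costs no RPC; a miss falls back to a single getFileStatus().
      FileStatus get(Path partPath) throws IOException {
        FileStatus status = statuses.get(partPath);
        if (status == null) {
          status = fs.getFileStatus(partPath);
          statuses.put(partPath, status);
        }
        return status;
      }
    }

The move from HashMap to ConcurrentHashMap matters because the map is now
long-lived and may be read and lazily filled from more than one thread.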

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java
index 02c0916..7e12d0a 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java
@@ -35,6 +35,7 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URLDecoder;
 import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
 
 import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs;
 
@@ -513,41 +514,22 @@ public class HarFileSystem extends FileSystem {
     if (!parentString.endsWith(Path.SEPARATOR)){
         parentString += Path.SEPARATOR;
     }
-    Path harPath = new Path(parentString);
-    int harlen = harPath.depth();
-    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
-
-    for (HarStatus hstatus : metadata.archive.values()) {
-      String child = hstatus.getName();
-      if ((child.startsWith(parentString))) {
-        Path thisPath = new Path(child);
-        if (thisPath.depth() == harlen + 1) {
-          statuses.add(toFileStatus(hstatus, cache));
-        }
-      }
+
+    for (String child: parent.children) {
+      Path p = new Path(parentString + child);
+      statuses.add(toFileStatus(metadata.archive.get(p)));
     }
   }
 
   /**
    * Combine the status stored in the index and the underlying status. 
    * @param h status stored in the index
-   * @param cache caching the underlying file statuses
    * @return the combined file status
    * @throws IOException
    */
-  private FileStatus toFileStatus(HarStatus h,
-      Map<String, FileStatus> cache) throws IOException {
-    FileStatus underlying = null;
-    if (cache != null) {
-      underlying = cache.get(h.partName);
-    }
-    if (underlying == null) {
-      final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
-      underlying = fs.getFileStatus(p);
-      if (cache != null) {
-        cache.put(h.partName, underlying);
-      }
-    }
+  private FileStatus toFileStatus(HarStatus h) throws IOException {
+    final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
+    FileStatus underlying = metadata.getPartFileStatus(p);
 
     long modTime = 0;
     int version = metadata.getVersion();
@@ -658,7 +640,7 @@ public class HarFileSystem extends FileSystem {
   @Override
   public FileStatus getFileStatus(Path f) throws IOException {
     HarStatus hstatus = getFileHarStatus(f);
-    return toFileStatus(hstatus, null);
+    return toFileStatus(hstatus);
   }
 
   private HarStatus getFileHarStatus(Path f) throws IOException {
@@ -815,7 +797,7 @@ public class HarFileSystem extends FileSystem {
     if (hstatus.isDir()) {
       fileStatusesInIndex(hstatus, statuses);
     } else {
-      statuses.add(toFileStatus(hstatus, null));
+      statuses.add(toFileStatus(hstatus));
     }
     
     return statuses.toArray(new FileStatus[statuses.size()]);
@@ -1143,7 +1125,8 @@ public class HarFileSystem extends FileSystem {
 
     List<Store> stores = new ArrayList<Store>();
     Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
-    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
+    // keys are always the internal har path.
+    private Map<Path, FileStatus> partFileStatuses = new ConcurrentHashMap<>();
 
     public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
       this.fs = fs;
@@ -1151,16 +1134,23 @@ public class HarFileSystem extends FileSystem {
       this.archiveIndexPath = archiveIndexPath;
     }
 
-    public FileStatus getPartFileStatus(Path partPath) throws IOException {
+    public FileStatus getPartFileStatus(Path path) throws IOException {
+      Path partPath = getPathInHar(path);
       FileStatus status;
       status = partFileStatuses.get(partPath);
       if (status == null) {
-        status = fs.getFileStatus(partPath);
+        status = fs.getFileStatus(path);
         partFileStatuses.put(partPath, status);
       }
       return status;
     }
 
+    private void addPartFileStatuses(Path path) throws IOException {
+      for (FileStatus stat : fs.listStatus(path)) {
+        partFileStatuses.put(getPathInHar(stat.getPath()), stat);
+      }
+    }
+
     public long getMasterIndexTimestamp() {
       return masterIndexTimestamp;
     }
@@ -1217,16 +1207,22 @@ public class HarFileSystem extends FileSystem {
       try {
         FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
         archiveIndexTimestamp = archiveStat.getModificationTime();
-        LineReader aLin;
+
+        // pre-populate part cache.
+        addPartFileStatuses(archiveIndexPath.getParent());
+        LineReader aLin = null;
 
         // now start reading the real index file
+        long pos = -1;
         for (Store s: stores) {
-          read = 0;
-          aIn.seek(s.begin);
-          aLin = new LineReader(aIn, getConf());
-          while (read + s.begin < s.end) {
-            int tmp = aLin.readLine(line);
-            read += tmp;
+          if (pos != s.begin) {
+            pos = s.begin;
+            aIn.seek(s.begin);
+            aLin = new LineReader(aIn, getConf());
+          }
+
+          while (pos < s.end) {
+            pos += aLin.readLine(line);
             String lineFeed = line.toString();
             String[] parsed = lineFeed.split(" ");
             parsed[0] = decodeFileName(parsed[0]);
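
The hunk above is a second, smaller refinement: the old loop re-seeked the
archive index stream and built a fresh LineReader for every store, even
when consecutive stores abut. The new loop tracks the current byte
position and only seeks (and rebuilds the reader) when there is an actual
gap. A self-contained sketch of that loop, with a stand-in Store class
mirroring the begin/end fields used in the diff:

    import java.io.IOException;
    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.util.LineReader;

    class IndexReadSketch {
      static class Store {          // stand-in for HarMetaData.Store
        long begin, end;
      }

      static void readStores(FSDataInputStream aIn, List<Store> stores,
          Configuration conf) throws IOException {
        Text line = new Text();
        LineReader aLin = null;
        long pos = -1;
        for (Store s : stores) {
          // Seek and rebuild the LineReader only when the next store does
          // not start where the previous one ended.
          if (pos != s.begin) {
            pos = s.begin;
            aIn.seek(s.begin);
            aLin = new LineReader(aIn, conf);
          }
          while (pos < s.end) {
            pos += aLin.readLine(line);  // returns bytes consumed
            // ... parse line.toString() into an index entry ...
          }
        }
      }
    }
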
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java
index 8050ce6..711ab94 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java
@@ -41,7 +41,6 @@ import java.util.EnumSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.concurrent.CompletableFuture;
 
 import static org.apache.hadoop.fs.Options.ChecksumOpt;
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java
index c58e731..6415df6 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java
@@ -33,7 +33,10 @@ import java.net.URI;
 import java.util.HashSet;
 import java.util.Set;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
 
 /**
  * This test class checks basic operations with {@link HarFileSystem} including
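
One more HarFileSystem change worth calling out, from the first hunk of
the diff: fileStatusesInIndex() no longer scans every entry in the archive
map and filters by prefix and depth; it walks the parent's recorded
children and does a direct map lookup per child. A toy model of that cost
difference (hypothetical Entry type with plain-string paths; the real code
uses HarStatus and Path):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    class HarListingSketch {
      static class Entry {                      // stand-in for HarStatus
        final String name;
        final List<String> children = new ArrayList<>();
        Entry(String name) { this.name = name; }
      }

      // Before: O(total archive entries) per listing.
      static List<Entry> listByScan(Map<String, Entry> archive, String parent) {
        String prefix = parent.endsWith("/") ? parent : parent + "/";
        List<Entry> result = new ArrayList<>();
        for (Entry e : archive.values()) {
          if (e.name.startsWith(prefix) && depth(e.name) == depth(prefix) + 1) {
            result.add(e);
          }
        }
        return result;
      }

      // After: O(children of parent) per listing.
      static List<Entry> listByChildren(Map<String, Entry> archive, Entry parent) {
        String prefix = parent.name.endsWith("/") ? parent.name : parent.name + "/";
        List<Entry> result = new ArrayList<>();
        for (String child : parent.children) {
          result.add(archive.get(prefix + child));
        }
        return result;
      }

      private static int depth(String path) {   // path segments, toy version
        return path.split("/").length;
      }
    }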


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org