You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cm...@apache.org on 2013/09/25 22:58:37 UTC

svn commit: r1526300 - in /hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common: CHANGES.txt src/main/java/org/apache/hadoop/fs/Globber.java

Author: cmccabe
Date: Wed Sep 25 20:58:37 2013
New Revision: 1526300

URL: http://svn.apache.org/r1526300
Log:
HADOOP-9981. globStatus should minimize its listStatus and getFileStatus calls.  (Contributed by Colin Patrick McCabe)

Modified:
    hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/CHANGES.txt
    hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java

Modified: hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/CHANGES.txt?rev=1526300&r1=1526299&r2=1526300&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/CHANGES.txt Wed Sep 25 20:58:37 2013
@@ -81,6 +81,9 @@ Release 2.3.0 - UNRELEASED
     HADOOP-9791. Add a test case covering long paths for new FileUtil access
     check methods (ivanmi)
 
+    HADOOP-9981. globStatus should minimize its listStatus and getFileStatus
+    calls.  (Contributed by Colin Patrick McCabe)
+
 Release 2.2.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java?rev=1526300&r1=1526299&r2=1526300&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java (original)
+++ hadoop/common/branches/branch-2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java Wed Sep 25 20:58:37 2013
@@ -84,6 +84,15 @@ class Globber {
   }
 
   /**
+   * Convert a path component that contains backslash ecape sequences to a
+   * literal string.  This is necessary when you want to explicitly refer to a
+   * path that contains globber metacharacters.
+   */
+  private static String unescapePathComponent(String name) {
+    return name.replaceAll("\\\\(.)", "$1");
+  }
+
+  /**
    * Translate an absolute path into a list of path components.
    * We merge double slashes into a single slash here.
    * POSIX root path, i.e. '/', does not get an entry in the list.
@@ -166,37 +175,72 @@ class Globber {
             new Path(scheme, authority, Path.SEPARATOR)));
       }
       
-      for (String component : components) {
+      for (int componentIdx = 0; componentIdx < components.size();
+          componentIdx++) {
         ArrayList<FileStatus> newCandidates =
             new ArrayList<FileStatus>(candidates.size());
-        GlobFilter globFilter = new GlobFilter(component);
+        GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
+        String component = unescapePathComponent(components.get(componentIdx));
         if (globFilter.hasPattern()) {
           sawWildcard = true;
         }
         if (candidates.isEmpty() && sawWildcard) {
+          // Optimization: if there are no more candidates left, stop examining 
+          // the path components.  We can only do this if we've already seen
+          // a wildcard component-- otherwise, we still need to visit all path 
+          // components in case one of them is a wildcard.
           break;
         }
-        for (FileStatus candidate : candidates) {
-          FileStatus resolvedCandidate = candidate;
-          if (candidate.isSymlink()) {
-            // We have to resolve symlinks, because otherwise we don't know
-            // whether they are directories.
-            resolvedCandidate = getFileStatus(candidate.getPath());
+        if ((componentIdx < components.size() - 1) &&
+            (!globFilter.hasPattern())) {
+          // Optimization: if this is not the terminal path component, and we 
+          // are not matching against a glob, assume that it exists.  If it 
+          // doesn't exist, we'll find out later when resolving a later glob
+          // or the terminal path component.
+          for (FileStatus candidate : candidates) {
+            candidate.setPath(new Path(candidate.getPath(), component));
           }
-          if (resolvedCandidate == null ||
-              resolvedCandidate.isDirectory() == false) {
-            continue;
-          }
-          FileStatus[] children = listStatus(candidate.getPath());
-          for (FileStatus child : children) {
-            // Set the child path based on the parent path.
-            // This keeps the symlinks in our path.
-            child.setPath(new Path(candidate.getPath(),
-                    child.getPath().getName()));
-            if (globFilter.accept(child.getPath())) {
-              newCandidates.add(child);
+          continue;
+        }
+        for (FileStatus candidate : candidates) {
+          if (globFilter.hasPattern()) {
+            FileStatus[] children = listStatus(candidate.getPath());
+            if (children.length == 1) {
+              // If we get back only one result, this could be either a listing
+              // of a directory with one entry, or it could reflect the fact
+              // that what we listed resolved to a file.
+              //
+              // Unfortunately, we can't just compare the returned paths to
+              // figure this out.  Consider the case where you have /a/b, where
+              // b is a symlink to "..".  In that case, listing /a/b will give
+              // back "/a/b" again.  If we just went by returned pathname, we'd
+              // incorrectly conclude that /a/b was a file and should not match
+              // /a/*/*.  So we use getFileStatus of the path we just listed to
+              // disambiguate.
+              if (!getFileStatus(candidate.getPath()).isDirectory()) {
+                continue;
+              }
             }
-          }
+            for (FileStatus child : children) {
+              // Set the child path based on the parent path.
+              child.setPath(new Path(candidate.getPath(),
+                      child.getPath().getName()));
+              if (globFilter.accept(child.getPath())) {
+                newCandidates.add(child);
+              }
+            }
+          } else {
+            // When dealing with non-glob components, use getFileStatus 
+            // instead of listStatus.  This is an optimization, but it also
+            // is necessary for correctness in HDFS, since there are some
+            // special HDFS directories like .reserved and .snapshot that are
+            // not visible to listStatus, but which do exist.  (See HADOOP-9877)
+            FileStatus childStatus = getFileStatus(
+                new Path(candidate.getPath(), component));
+            if (childStatus != null) {
+              newCandidates.add(childStatus);
+             }
+           }
         }
         candidates = newCandidates;
       }