You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cm...@apache.org on 2013/09/25 22:51:10 UTC
svn commit: r1526297 - in
/hadoop/common/trunk/hadoop-common-project/hadoop-common: CHANGES.txt
src/main/java/org/apache/hadoop/fs/Globber.java
Author: cmccabe
Date: Wed Sep 25 20:51:09 2013
New Revision: 1526297
URL: http://svn.apache.org/r1526297
Log:
HADOOP-9981. globStatus should minimize its listStatus and getFileStatus calls. (Contributed by Colin Patrick McCabe)
Modified:
hadoop/common/trunk/hadoop-common-project/hadoop-common/CHANGES.txt
hadoop/common/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
Modified: hadoop/common/trunk/hadoop-common-project/hadoop-common/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-common-project/hadoop-common/CHANGES.txt?rev=1526297&r1=1526296&r2=1526297&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-common-project/hadoop-common/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-common-project/hadoop-common/CHANGES.txt Wed Sep 25 20:51:09 2013
@@ -369,6 +369,9 @@ Release 2.3.0 - UNRELEASED
HADOOP-9791. Add a test case covering long paths for new FileUtil access
check methods (ivanmi)
+ HADOOP-9981. globStatus should minimize its listStatus and getFileStatus
+ calls. (Contributed by Colin Patrick McCabe)
+
Release 2.2.0 - UNRELEASED
INCOMPATIBLE CHANGES
Modified: hadoop/common/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java?rev=1526297&r1=1526296&r2=1526297&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java (original)
+++ hadoop/common/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java Wed Sep 25 20:51:09 2013
@@ -84,6 +84,15 @@ class Globber {
}
/**
+ * Convert a path component that contains backslash escape sequences to a
+ * literal string. This is necessary when you want to explicitly refer to a
+ * path that contains globber metacharacters.
+ */
+ private static String unescapePathComponent(String name) {
+ return name.replaceAll("\\\\(.)", "$1");
+ }
+
+ /**
* Translate an absolute path into a list of path components.
* We merge double slashes into a single slash here.
* POSIX root path, i.e. '/', does not get an entry in the list.
@@ -166,37 +175,72 @@ class Globber {
new Path(scheme, authority, Path.SEPARATOR)));
}
- for (String component : components) {
+ for (int componentIdx = 0; componentIdx < components.size();
+ componentIdx++) {
ArrayList<FileStatus> newCandidates =
new ArrayList<FileStatus>(candidates.size());
- GlobFilter globFilter = new GlobFilter(component);
+ GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
+ String component = unescapePathComponent(components.get(componentIdx));
if (globFilter.hasPattern()) {
sawWildcard = true;
}
if (candidates.isEmpty() && sawWildcard) {
+ // Optimization: if there are no more candidates left, stop examining
+ // the path components. We can only do this if we've already seen
+ // a wildcard component-- otherwise, we still need to visit all path
+ // components in case one of them is a wildcard.
break;
}
- for (FileStatus candidate : candidates) {
- FileStatus resolvedCandidate = candidate;
- if (candidate.isSymlink()) {
- // We have to resolve symlinks, because otherwise we don't know
- // whether they are directories.
- resolvedCandidate = getFileStatus(candidate.getPath());
+ if ((componentIdx < components.size() - 1) &&
+ (!globFilter.hasPattern())) {
+ // Optimization: if this is not the terminal path component, and we
+ // are not matching against a glob, assume that it exists. If it
+ // doesn't exist, we'll find out later when resolving a later glob
+ // or the terminal path component.
+ for (FileStatus candidate : candidates) {
+ candidate.setPath(new Path(candidate.getPath(), component));
}
- if (resolvedCandidate == null ||
- resolvedCandidate.isDirectory() == false) {
- continue;
- }
- FileStatus[] children = listStatus(candidate.getPath());
- for (FileStatus child : children) {
- // Set the child path based on the parent path.
- // This keeps the symlinks in our path.
- child.setPath(new Path(candidate.getPath(),
- child.getPath().getName()));
- if (globFilter.accept(child.getPath())) {
- newCandidates.add(child);
+ continue;
+ }
+ for (FileStatus candidate : candidates) {
+ if (globFilter.hasPattern()) {
+ FileStatus[] children = listStatus(candidate.getPath());
+ if (children.length == 1) {
+ // If we get back only one result, this could be either a listing
+ // of a directory with one entry, or it could reflect the fact
+ // that what we listed resolved to a file.
+ //
+ // Unfortunately, we can't just compare the returned paths to
+ // figure this out. Consider the case where you have /a/b, where
+ // b is a symlink to "..". In that case, listing /a/b will give
+ // back "/a/b" again. If we just went by returned pathname, we'd
+ // incorrectly conclude that /a/b was a file and should not match
+ // /a/*/*. So we use getFileStatus of the path we just listed to
+ // disambiguate.
+ if (!getFileStatus(candidate.getPath()).isDirectory()) {
+ continue;
+ }
}
- }
+ for (FileStatus child : children) {
+ // Set the child path based on the parent path.
+ child.setPath(new Path(candidate.getPath(),
+ child.getPath().getName()));
+ if (globFilter.accept(child.getPath())) {
+ newCandidates.add(child);
+ }
+ }
+ } else {
+ // When dealing with non-glob components, use getFileStatus
+ // instead of listStatus. This is an optimization, but it also
+ // is necessary for correctness in HDFS, since there are some
+ // special HDFS directories like .reserved and .snapshot that are
+ // not visible to listStatus, but which do exist. (See HADOOP-9877)
+ FileStatus childStatus = getFileStatus(
+ new Path(candidate.getPath(), component));
+ if (childStatus != null) {
+ newCandidates.add(childStatus);
+ }
+ }
}
candidates = newCandidates;
}