You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by as...@apache.org on 2017/08/24 19:36:06 UTC
[13/50] [abbrv] hadoop git commit: YARN-3254. HealthReport should
include disk full information. Contributed by Suma Shivaprasad.
YARN-3254. HealthReport should include disk full information. Contributed by Suma Shivaprasad.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/f9a0e233
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/f9a0e233
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/f9a0e233
Branch: refs/heads/YARN-5972
Commit: f9a0e2338150f1bd3ba2c29f76979183fd3ed80c
Parents: 1f04cb4
Author: Sunil G <su...@apache.org>
Authored: Thu Aug 17 15:07:15 2017 +0530
Committer: Sunil G <su...@apache.org>
Committed: Thu Aug 17 15:07:15 2017 +0530
----------------------------------------------------------------------
.../server/nodemanager/DirectoryCollection.java | 61 +++++++++++++++++++-
.../nodemanager/LocalDirsHandlerService.java | 59 +++++++++++++++----
.../nodemanager/TestDirectoryCollection.java | 23 ++++++++
3 files changed, 130 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f9a0e233/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
index ae2a4ef..502485f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
@@ -38,6 +38,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
@@ -99,6 +100,7 @@ public class DirectoryCollection {
private List<String> localDirs;
private List<String> errorDirs;
private List<String> fullDirs;
+ private Map<String, DiskErrorInformation> directoryErrorInfo;
// read/write lock for accessing above directories.
private final ReadLock readLock;
@@ -192,6 +194,7 @@ public class DirectoryCollection {
localDirs = new CopyOnWriteArrayList<>(dirs);
errorDirs = new CopyOnWriteArrayList<>();
fullDirs = new CopyOnWriteArrayList<>();
+ directoryErrorInfo = new ConcurrentHashMap<>();
ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
this.readLock = lock.readLock();
@@ -248,11 +251,25 @@ public class DirectoryCollection {
/**
* @return the directories that have used all disk space
*/
-
List<String> getFullDirs() {
this.readLock.lock();
try {
- return fullDirs;
+ return Collections.unmodifiableList(fullDirs);
+ } finally {
+ this.readLock.unlock();
+ }
+ }
+
+ /**
+ * @return the directories that have errors - may not have appropriate permissions
+ * or other disk validation checks might have failed in {@link DiskValidator}
+ *
+ */
+ @InterfaceStability.Evolving
+ List<String> getErroredDirs() {
+ this.readLock.lock();
+ try {
+ return Collections.unmodifiableList(errorDirs);
} finally {
this.readLock.unlock();
}
@@ -271,6 +288,39 @@ public class DirectoryCollection {
}
/**
+ *
+ * @param dirName Absolute path of Directory for which error diagnostics are needed
+ * @return DiskErrorInformation - disk error diagnostics for the specified directory
+ * null - the disk associated with the directory has passed disk utilization checks
+ * /error validations in {@link DiskValidator}
+ *
+ */
+ @InterfaceStability.Evolving
+ DiskErrorInformation getDirectoryErrorInfo(String dirName) {
+ this.readLock.lock();
+ try {
+ return directoryErrorInfo.get(dirName);
+ } finally {
+ this.readLock.unlock();
+ }
+ }
+
+ /**
+ *
+ * @param dirName Absolute path of Directory for which the disk has been marked as unhealthy
+ * @return true if the disk associated with the directory is unhealthy, false otherwise
+ */
+ @InterfaceStability.Evolving
+ boolean isDiskUnHealthy(String dirName) {
+ this.readLock.lock();
+ try {
+ return directoryErrorInfo.containsKey(dirName);
+ } finally {
+ this.readLock.unlock();
+ }
+ }
+
+ /**
* Create any non-existent directories and parent directories, updating the
* list of valid directories if necessary.
* @param localFs local file system to use
@@ -297,6 +347,9 @@ public class DirectoryCollection {
try {
localDirs.remove(dir);
errorDirs.add(dir);
+ directoryErrorInfo.put(dir,
+ new DiskErrorInformation(DiskErrorCause.OTHER,
+ "Cannot create directory : " + dir + ", error " + e.getMessage()));
numFailures++;
} finally {
this.writeLock.unlock();
@@ -343,11 +396,13 @@ public class DirectoryCollection {
localDirs.clear();
errorDirs.clear();
fullDirs.clear();
+ directoryErrorInfo.clear();
for (Map.Entry<String, DiskErrorInformation> entry : dirsFailedCheck
.entrySet()) {
String dir = entry.getKey();
DiskErrorInformation errorInformation = entry.getValue();
+
switch (entry.getValue().cause) {
case DISK_FULL:
fullDirs.add(entry.getKey());
@@ -359,6 +414,8 @@ public class DirectoryCollection {
LOG.warn(entry.getValue().cause + " is unknown for disk error.");
break;
}
+ directoryErrorInfo.put(entry.getKey(), errorInformation);
+
if (preCheckGoodDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error, " + errorInformation.message
+ ", removing from list of valid directories");
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f9a0e233/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
index f8cb4ee..6e00808 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
@@ -53,6 +53,8 @@ public class LocalDirsHandlerService extends AbstractService {
private static Log LOG = LogFactory.getLog(LocalDirsHandlerService.class);
+ private static final String diskCapacityExceededErrorMsg = "usable space is below configured utilization percentage/no more usable space";
+
/**
* Good local directories, use internally,
* initial value is the same as NM_LOCAL_DIRS.
@@ -344,21 +346,36 @@ public class LocalDirsHandlerService extends AbstractService {
}
StringBuilder report = new StringBuilder();
- List<String> failedLocalDirsList = localDirs.getFailedDirs();
- List<String> failedLogDirsList = logDirs.getFailedDirs();
+ List<String> erroredLocalDirsList = localDirs.getErroredDirs();
+ List<String> erroredLogDirsList = logDirs.getErroredDirs();
+ List<String> diskFullLocalDirsList = localDirs.getFullDirs();
+ List<String> diskFullLogDirsList = logDirs.getFullDirs();
List<String> goodLocalDirsList = localDirs.getGoodDirs();
List<String> goodLogDirsList = logDirs.getGoodDirs();
- int numLocalDirs = goodLocalDirsList.size() + failedLocalDirsList.size();
- int numLogDirs = goodLogDirsList.size() + failedLogDirsList.size();
+
+ int numLocalDirs = goodLocalDirsList.size() + erroredLocalDirsList.size() + diskFullLocalDirsList.size();
+ int numLogDirs = goodLogDirsList.size() + erroredLogDirsList.size() + diskFullLogDirsList.size();
if (!listGoodDirs) {
- if (!failedLocalDirsList.isEmpty()) {
- report.append(failedLocalDirsList.size() + "/" + numLocalDirs
- + " local-dirs are bad: "
- + StringUtils.join(",", failedLocalDirsList) + "; ");
+ if (!erroredLocalDirsList.isEmpty()) {
+ report.append(erroredLocalDirsList.size() + "/" + numLocalDirs
+ + " local-dirs have errors: "
+ + buildDiskErrorReport(erroredLocalDirsList, localDirs));
+ }
+ if (!diskFullLocalDirsList.isEmpty()) {
+ report.append(diskFullLocalDirsList.size() + "/" + numLocalDirs
+ + " local-dirs " + diskCapacityExceededErrorMsg
+ + buildDiskErrorReport(diskFullLocalDirsList, localDirs) + "; ");
}
- if (!failedLogDirsList.isEmpty()) {
- report.append(failedLogDirsList.size() + "/" + numLogDirs
- + " log-dirs are bad: " + StringUtils.join(",", failedLogDirsList));
+
+ if (!erroredLogDirsList.isEmpty()) {
+ report.append(erroredLogDirsList.size() + "/" + numLogDirs
+ + " log-dirs have errors: "
+ + buildDiskErrorReport(erroredLogDirsList, logDirs));
+ }
+ if (!diskFullLogDirsList.isEmpty()) {
+ report.append(diskFullLogDirsList.size() + "/" + numLogDirs
+ + " log-dirs " + diskCapacityExceededErrorMsg
+ + buildDiskErrorReport(diskFullLogDirsList, logDirs));
}
} else {
report.append(goodLocalDirsList.size() + "/" + numLocalDirs
@@ -620,4 +637,24 @@ public class LocalDirsHandlerService extends AbstractService {
logDirs.getGoodDirsDiskUtilizationPercentage());
}
}
+
+ private String buildDiskErrorReport(List<String> dirs, DirectoryCollection directoryCollection) {
+ StringBuilder sb = new StringBuilder();
+
+ sb.append(" [ ");
+ for (int i = 0; i < dirs.size(); i++) {
+ final String dirName = dirs.get(i);
+ if ( directoryCollection.isDiskUnHealthy(dirName)) {
+ sb.append(dirName + " : " + directoryCollection.getDirectoryErrorInfo(dirName).message);
+ } else {
+ sb.append(dirName + " : " + "Unknown cause for disk error");
+ }
+
+ if ( i != (dirs.size() - 1)) {
+ sb.append(" , ");
+ }
+ }
+ sb.append(" ] ");
+ return sb.toString();
+ }
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f9a0e233/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
index e529628..095f21a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
@@ -128,8 +128,12 @@ public class TestDirectoryCollection {
DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F);
dc.checkDirs();
Assert.assertEquals(0, dc.getGoodDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
+ Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
+ Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
+
// no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
@@ -139,16 +143,21 @@ public class TestDirectoryCollection {
testDir.getTotalSpace());
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
+ Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
+
Assert.assertEquals(utilizedSpacePerc,
dc.getGoodDirsDiskUtilizationPercentage());
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
dc.checkDirs();
Assert.assertEquals(0, dc.getGoodDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
+ Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
// no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
@@ -158,8 +167,11 @@ public class TestDirectoryCollection {
testDir.getTotalSpace());
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
+ Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
+
Assert.assertEquals(utilizedSpacePerc,
dc.getGoodDirsDiskUtilizationPercentage());
}
@@ -209,12 +221,17 @@ public class TestDirectoryCollection {
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
+ Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
+ Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F);
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
+ Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
@@ -232,12 +249,18 @@ public class TestDirectoryCollection {
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
+ Assert.assertEquals(1, dc.getErroredDirs().size());
+ Assert.assertNotNull(dc.getDirectoryErrorInfo(dirB));
+ Assert.assertEquals(DirectoryCollection.DiskErrorCause.OTHER, dc.getDirectoryErrorInfo(dirB).cause);
+
permDirB = new FsPermission((short) 0700);
localFs.setPermission(pathB, permDirB);
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
+ Assert.assertEquals(0, dc.getErroredDirs().size());
+ Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org