Posted to mapreduce-commits@hadoop.apache.org by ma...@apache.org on 2011/11/30 00:18:10 UTC

svn commit: r1208131 [3/3] - in /hadoop/common/trunk/hadoop-mapreduce-project: ./ conf/ hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapred/ hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/ja...

Added: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java?rev=1208131&view=auto
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java (added)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeHealthService.java Tue Nov 29 23:17:54 2011
@@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.TimerTask;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileContext;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.yarn.api.records.NodeHealthStatus;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestNodeHealthService {
+
+  private static volatile Log LOG = LogFactory
+      .getLog(TestNodeHealthService.class);
+
+  protected static File testRootDir = new File("target",
+      TestNodeHealthService.class.getName() + "-localDir").getAbsoluteFile();
+
+  final static File nodeHealthConfigFile = new File(testRootDir,
+      "modified-mapred-site.xml");
+
+  private File nodeHealthscriptFile = new File(testRootDir,
+      "failingscript.sh");
+
+  @Before
+  public void setup() {
+    testRootDir.mkdirs();
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    if (testRootDir.exists()) {
+      FileContext.getLocalFSFileContext().delete(
+          new Path(testRootDir.getAbsolutePath()), true);
+    }
+  }
+
+  private Configuration getConfForNodeHealthScript() {
+    Configuration conf = new Configuration();
+    conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH,
+        nodeHealthscriptFile.getAbsolutePath());
+    conf.setLong(YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, 500);
+    conf.setLong(
+        YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, 1000);
+    return conf;
+  }
+
+  private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable)
+      throws IOException {
+    PrintWriter pw = new PrintWriter(new FileOutputStream(nodeHealthscriptFile));
+    pw.println(scriptStr);
+    pw.flush();
+    pw.close();
+    nodeHealthscriptFile.setExecutable(setExecutable);
+  }
+
+  @Test
+  public void testNodeHealthScriptShouldRun() throws IOException {
+    // Node health script should not start if there is no property called
+    // node health script path.
+    Assert.assertFalse("By default Health script should not have started",
+        NodeHealthScriptRunner.shouldRun(new Configuration()));
+    Configuration conf = getConfForNodeHealthScript();
+    // Node health script should not start if the node health script does not
+    // exist.
+    Assert.assertFalse("Node health script should not have started",
+        NodeHealthScriptRunner.shouldRun(conf));
+    // Create script path.
+    conf.writeXml(new FileOutputStream(nodeHealthConfigFile));
+    conf.addResource(nodeHealthConfigFile.getName());
+    writeNodeHealthScriptFile("", false);
+    // Node health script should not start if the node health script is not
+    // executable.
+    Assert.assertFalse("Node health script should not have started",
+        NodeHealthScriptRunner.shouldRun(conf));
+    writeNodeHealthScriptFile("", true);
+    Assert.assertTrue("Node health script should start",
+        NodeHealthScriptRunner.shouldRun(conf));
+  }
+
+  private void setHealthStatus(NodeHealthStatus healthStatus, boolean isHealthy,
+      String healthReport, long lastHealthReportTime) {
+    healthStatus.setHealthReport(healthReport);
+    healthStatus.setIsNodeHealthy(isHealthy);
+    healthStatus.setLastHealthReportTime(lastHealthReportTime);
+  }
+
+  @Test
+  public void testNodeHealthScript() throws Exception {
+    RecordFactory factory = RecordFactoryProvider.getRecordFactory(null);
+    NodeHealthStatus healthStatus =
+        factory.newRecordInstance(NodeHealthStatus.class);
+    String errorScript = "echo ERROR\n echo \"Tracker not healthy\"";
+    String normalScript = "echo \"I am all fine\"";
+    String timeOutScript = "sleep 4\n echo \"I am fine\"";
+    Configuration conf = getConfForNodeHealthScript();
+    conf.writeXml(new FileOutputStream(nodeHealthConfigFile));
+    conf.addResource(nodeHealthConfigFile.getName());
+
+    writeNodeHealthScriptFile(normalScript, true);
+    NodeHealthCheckerService nodeHealthChecker = new NodeHealthCheckerService();
+    nodeHealthChecker.init(conf);
+    NodeHealthScriptRunner nodeHealthScriptRunner =
+        nodeHealthChecker.getNodeHealthScriptRunner();
+    TimerTask timerTask = nodeHealthScriptRunner.getTimerTask();
+
+    timerTask.run();
+
+    setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(),
+        nodeHealthChecker.getHealthReport(),
+        nodeHealthChecker.getLastHealthReportTime());
+    LOG.info("Checking initial healthy condition");
+    // Check proper report conditions.
+    Assert.assertTrue("Node health status reported unhealthy", healthStatus
+        .getIsNodeHealthy());
+    Assert.assertTrue("Node health status reported unhealthy", healthStatus
+        .getHealthReport().equals(nodeHealthChecker.getHealthReport()));
+
+    // write out error file.
+    // Healthy to unhealthy transition
+    writeNodeHealthScriptFile(errorScript, true);
+    // Run timer
+    timerTask.run();
+    // update health status
+    setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(),
+        nodeHealthChecker.getHealthReport(),
+        nodeHealthChecker.getLastHealthReportTime());
+    LOG.info("Checking Healthy--->Unhealthy");
+    Assert.assertFalse("Node health status reported healthy", healthStatus
+        .getIsNodeHealthy());
+    Assert.assertTrue("Node health status reported healthy", healthStatus
+        .getHealthReport().equals(nodeHealthChecker.getHealthReport()));
+    
+    // Check unhealthy to healthy transitions.
+    writeNodeHealthScriptFile(normalScript, true);
+    timerTask.run();
+    setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(),
+        nodeHealthChecker.getHealthReport(),
+        nodeHealthChecker.getLastHealthReportTime());
+    LOG.info("Checking UnHealthy--->healthy");
+    // Check proper report conditions.
+    Assert.assertTrue("Node health status reported unhealthy", healthStatus
+        .getIsNodeHealthy());
+    Assert.assertTrue("Node health status reported unhealthy", healthStatus
+        .getHealthReport().equals(nodeHealthChecker.getHealthReport()));
+
+    // Healthy to timeout transition.
+    writeNodeHealthScriptFile(timeOutScript, true);
+    timerTask.run();
+    setHealthStatus(healthStatus, nodeHealthChecker.isHealthy(),
+        nodeHealthChecker.getHealthReport(),
+        nodeHealthChecker.getLastHealthReportTime());
+    LOG.info("Checking Healthy--->timeout");
+    Assert.assertFalse("Node health status reported healthy even after timeout",
+        healthStatus.getIsNodeHealthy());
+    Assert.assertTrue("Node script time out message not propagated",
+        healthStatus.getHealthReport().equals(
+            NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG
+            + NodeHealthCheckerService.SEPARATOR
+            + nodeHealthChecker.getDiskHandler().getDisksHealthReport()));
+  }
+
+}
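
For reference, the new health-checker API exercised by the test above can be driven roughly as follows. This is a minimal sketch based only on the calls and configuration keys that appear in this commit; the class name, the script path, and the interval/timeout values are illustrative assumptions, not part of the commit, and the sketch assumes the configured script actually exists and is executable.

    import java.util.TimerTask;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;
    import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
    import org.apache.hadoop.yarn.server.nodemanager.NodeHealthScriptRunner;

    public class NodeHealthCheckSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Illustrative values; the test above uses a throwaway script and short intervals.
        conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH,
            "/usr/local/bin/nm-health-check.sh");   // hypothetical script path
        conf.setLong(YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, 10 * 60 * 1000L);
        conf.setLong(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, 5 * 60 * 1000L);

        NodeHealthCheckerService checker = new NodeHealthCheckerService();
        checker.init(conf);

        // Run one health-script check by hand, the same way the test does.
        NodeHealthScriptRunner scriptRunner = checker.getNodeHealthScriptRunner();
        TimerTask check = scriptRunner.getTimerTask();
        check.run();

        // The aggregated per-node verdict combines the script result and disk state.
        System.out.println("healthy=" + checker.isHealthy()
            + ", report=" + checker.getHealthReport()
            + ", lastReportTime=" + checker.getLastHealthReportTime());
      }
    }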

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java Tue Nov 29 23:17:54 2011
@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentMa
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.NodeHealthCheckerService;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.Path;
@@ -440,10 +439,11 @@ public class TestNodeStatusUpdater {
           ContainerExecutor exec, DeletionService del,
           NodeStatusUpdater nodeStatusUpdater,
           ContainerTokenSecretManager containerTokenSecretManager,
-          ApplicationACLsManager aclsManager) {
+          ApplicationACLsManager aclsManager,
+          LocalDirsHandlerService diskhandler) {
         return new ContainerManagerImpl(context, exec, del,
             nodeStatusUpdater, metrics, containerTokenSecretManager,
-            aclsManager) {
+            aclsManager, diskhandler) {
           @Override
           public void start() {
             // Simulating failure of starting RPC server

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java Tue Nov 29 23:17:54 2011
@@ -45,7 +45,9 @@ import org.apache.hadoop.yarn.server.nod
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
 import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
 import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
 import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
 import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl;
@@ -94,6 +96,8 @@ public abstract class BaseContainerManag
   protected ContainerExecutor exec;
   protected DeletionService delSrvc;
   protected String user = "nobody";
+  protected NodeHealthCheckerService nodeHealthChecker;
+  protected LocalDirsHandlerService dirsHandler;
 
   protected NodeStatusUpdater nodeStatusUpdater = new NodeStatusUpdaterImpl(
       context, new AsyncDispatcher(), null, metrics, this.containerTokenSecretManager) {
@@ -147,9 +151,12 @@ public abstract class BaseContainerManag
     delSrvc.init(conf);
 
     exec = createContainerExecutor();
+    nodeHealthChecker = new NodeHealthCheckerService();
+    nodeHealthChecker.init(conf);
+    dirsHandler = nodeHealthChecker.getDiskHandler();
     containerManager = new ContainerManagerImpl(context, exec, delSrvc,
         nodeStatusUpdater, metrics, this.containerTokenSecretManager,
-        new ApplicationACLsManager(conf));
+        new ApplicationACLsManager(conf), dirsHandler);
     containerManager.init(conf);
   }
 

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java Tue Nov 29 23:17:54 2011
@@ -383,11 +383,12 @@ public class TestContainerManager extend
     // Real del service
     delSrvc = new DeletionService(exec);
     delSrvc.init(conf);
+
     ContainerTokenSecretManager containerTokenSecretManager = new 
         ContainerTokenSecretManager();
     containerManager = new ContainerManagerImpl(context, exec, delSrvc,
         nodeStatusUpdater, metrics, containerTokenSecretManager,
-        new ApplicationACLsManager(conf));
+        new ApplicationACLsManager(conf), dirsHandler);
     containerManager.init(conf);
     containerManager.start();
 

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java Tue Nov 29 23:17:54 2011
@@ -25,6 +25,7 @@ import static org.mockito.Mockito.reset;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
+import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.ByteBuffer;
 import java.util.AbstractMap.SimpleEntry;
@@ -649,7 +650,8 @@ public class TestContainer {
 
     public void containerFailed(int exitCode) {
       c.handle(new ContainerExitEvent(cId,
-          ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, exitCode));
+          ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, exitCode,
+          "Container completed with exit code " + exitCode));
       drainDispatcherEvents();
     }
 
@@ -659,9 +661,10 @@ public class TestContainer {
     }
 
     public void containerKilledOnRequest() {
+      int exitCode = ExitCode.FORCE_KILLED.getExitCode();
       c.handle(new ContainerExitEvent(cId,
-          ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.FORCE_KILLED
-              .getExitCode()));
+          ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode,
+          "Container completed with exit code " + exitCode));
       drainDispatcherEvents();
     }
     

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/TestResourceLocalizationService.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/TestResourceLocalizationService.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/TestResourceLocalizationService.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/TestResourceLocalizationService.java Tue Nov 29 23:17:54 2011
@@ -59,6 +59,8 @@ import org.apache.hadoop.yarn.event.Drai
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
 import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
 import org.apache.hadoop.yarn.server.nodemanager.api.protocolrecords.LocalResourceStatus;
 import org.apache.hadoop.yarn.server.nodemanager.api.protocolrecords.LocalizerAction;
 import org.apache.hadoop.yarn.server.nodemanager.api.protocolrecords.LocalizerHeartbeatResponse;
@@ -109,19 +111,23 @@ public class TestResourceLocalizationSer
     doNothing().when(spylfs).mkdir(
         isA(Path.class), isA(FsPermission.class), anyBoolean());
 
+    List<Path> localDirs = new ArrayList<Path>();
+    String[] sDirs = new String[4];
+    for (int i = 0; i < 4; ++i) {
+      localDirs.add(lfs.makeQualified(new Path(basedir, i + "")));
+      sDirs[i] = localDirs.get(i).toString();
+    }
+    conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS, sDirs);
+    LocalDirsHandlerService diskhandler = new LocalDirsHandlerService();
+    diskhandler.init(conf);
+
     ResourceLocalizationService locService =
-      spy(new ResourceLocalizationService(dispatcher, exec, delService));
+      spy(new ResourceLocalizationService(dispatcher, exec, delService,
+                                          diskhandler));
     doReturn(lfs)
       .when(locService).getLocalFileContext(isA(Configuration.class));
     try {
       dispatcher.start();
-      List<Path> localDirs = new ArrayList<Path>();
-      String[] sDirs = new String[4];
-      for (int i = 0; i < 4; ++i) {
-        localDirs.add(lfs.makeQualified(new Path(basedir, i + "")));
-        sDirs[i] = localDirs.get(i).toString();
-      }
-      conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS, sDirs);
 
       // initialize ResourceLocalizationService
       locService.init(conf);
@@ -176,12 +182,16 @@ public class TestResourceLocalizationSer
     dispatcher.register(LocalizerEventType.class, localizerBus);
 
     ContainerExecutor exec = mock(ContainerExecutor.class);
+    LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
+    dirsHandler.init(conf);
+
     DeletionService delService = new DeletionService(exec);
     delService.init(null);
     delService.start();
 
     ResourceLocalizationService rawService =
-      new ResourceLocalizationService(dispatcher, exec, delService);
+      new ResourceLocalizationService(dispatcher, exec, delService,
+                                      dirsHandler);
     ResourceLocalizationService spyService = spy(rawService);
     doReturn(ignore).when(spyService).createServer();
     doReturn(mockLocallilzerTracker).when(spyService).createLocalizerTracker(
@@ -356,13 +366,17 @@ public class TestResourceLocalizationSer
     dispatcher.register(ContainerEventType.class, containerBus);
 
     ContainerExecutor exec = mock(ContainerExecutor.class);
+    LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
+    dirsHandler.init(conf);
+
     DeletionService delServiceReal = new DeletionService(exec);
     DeletionService delService = spy(delServiceReal);
     delService.init(null);
     delService.start();
 
     ResourceLocalizationService rawService =
-      new ResourceLocalizationService(dispatcher, exec, delService);
+      new ResourceLocalizationService(dispatcher, exec, delService,
+                                      dirsHandler);
     ResourceLocalizationService spyService = spy(rawService);
     doReturn(ignore).when(spyService).createServer();
     doReturn(lfs).when(spyService).getLocalFileContext(isA(Configuration.class));
@@ -414,8 +428,9 @@ public class TestResourceLocalizationSer
       String appStr = ConverterUtils.toString(appId);
       String ctnrStr = c.getContainerID().toString();
       ArgumentCaptor<Path> tokenPathCaptor = ArgumentCaptor.forClass(Path.class);
-      verify(exec).startLocalizer(tokenPathCaptor.capture(), isA(InetSocketAddress.class),
-        eq("user0"), eq(appStr), eq(ctnrStr), isA(List.class));
+      verify(exec).startLocalizer(tokenPathCaptor.capture(),
+          isA(InetSocketAddress.class), eq("user0"), eq(appStr), eq(ctnrStr),
+          isA(List.class), isA(List.class));
       Path localizationTokenPath = tokenPathCaptor.getValue();
 
       // heartbeat from localizer

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java Tue Nov 29 23:17:54 2011
@@ -122,7 +122,8 @@ public class TestLogAggregationService e
     dispatcher.register(ApplicationEventType.class, appEventHandler);
     
     LogAggregationService logAggregationService =
-        new LogAggregationService(dispatcher, this.context, this.delSrvc);
+        new LogAggregationService(dispatcher, this.context, this.delSrvc,
+                                  super.dirsHandler);
     logAggregationService.init(this.conf);
     logAggregationService.start();
 
@@ -189,7 +190,8 @@ public class TestLogAggregationService e
     dispatcher.register(ApplicationEventType.class, appEventHandler);
     
     LogAggregationService logAggregationService =
-        new LogAggregationService(dispatcher, this.context, this.delSrvc);
+        new LogAggregationService(dispatcher, this.context, this.delSrvc,
+                                  super.dirsHandler);
     logAggregationService.init(this.conf);
     logAggregationService.start();
 
@@ -237,7 +239,8 @@ public class TestLogAggregationService e
     dispatcher.register(ApplicationEventType.class, appEventHandler);
     
     LogAggregationService logAggregationService =
-        new LogAggregationService(dispatcher, this.context, this.delSrvc);
+        new LogAggregationService(dispatcher, this.context, this.delSrvc,
+                                  super.dirsHandler);
     logAggregationService.init(this.conf);
     logAggregationService.start();
 

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/TestNonAggregatingLogHandler.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/TestNonAggregatingLogHandler.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/TestNonAggregatingLogHandler.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/TestNonAggregatingLogHandler.java Tue Nov 29 23:17:54 2011
@@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.event.Drai
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.logaggregation.ContainerLogsRetentionPolicy;
 import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppFinishedEvent;
@@ -74,13 +75,16 @@ public class TestNonAggregatingLogHandle
     EventHandler<ApplicationEvent> appEventHandler = mock(EventHandler.class);
     dispatcher.register(ApplicationEventType.class, appEventHandler);
 
+    LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
+    dirsHandler.init(conf);
+
     ApplicationId appId1 = BuilderUtils.newApplicationId(1234, 1);
     ApplicationAttemptId appAttemptId1 =
         BuilderUtils.newApplicationAttemptId(appId1, 1);
     ContainerId container11 = BuilderUtils.newContainerId(appAttemptId1, 1);
 
     NonAggregatingLogHandler logHandler =
-        new NonAggregatingLogHandler(dispatcher, delService);
+        new NonAggregatingLogHandler(dispatcher, delService, dirsHandler);
     logHandler.init(conf);
     logHandler.start();
 
@@ -146,13 +150,17 @@ public class TestNonAggregatingLogHandle
     EventHandler<ApplicationEvent> appEventHandler = mock(EventHandler.class);
     dispatcher.register(ApplicationEventType.class, appEventHandler);
 
+    LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
+    dirsHandler.init(conf);
+
     ApplicationId appId1 = BuilderUtils.newApplicationId(1234, 1);
     ApplicationAttemptId appAttemptId1 =
         BuilderUtils.newApplicationAttemptId(appId1, 1);
     ContainerId container11 = BuilderUtils.newContainerId(appAttemptId1, 1);
 
     NonAggregatingLogHandler logHandler =
-        new NonAggregatingLogHandlerWithMockExecutor(dispatcher, delService);
+        new NonAggregatingLogHandlerWithMockExecutor(dispatcher, delService,
+                                                     dirsHandler);
     logHandler.init(conf);
     logHandler.start();
 
@@ -182,8 +190,8 @@ public class TestNonAggregatingLogHandle
     private ScheduledThreadPoolExecutor mockSched;
 
     public NonAggregatingLogHandlerWithMockExecutor(Dispatcher dispatcher,
-        DeletionService delService) {
-      super(dispatcher, delService);
+        DeletionService delService, LocalDirsHandlerService dirsHandler) {
+      super(dispatcher, delService, dirsHandler);
     }
 
     @Override

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java Tue Nov 29 23:17:54 2011
@@ -27,6 +27,7 @@ import java.io.IOException;
 import java.io.Writer;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
@@ -37,6 +38,8 @@ import org.apache.hadoop.yarn.event.Disp
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
 import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
@@ -47,6 +50,7 @@ import org.apache.hadoop.yarn.server.nod
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.util.BuilderUtils;
 import org.apache.hadoop.yarn.util.ConverterUtils;
+import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -54,10 +58,19 @@ public class TestNMWebServer {
 
   private static final File testRootDir = new File("target",
       TestNMWebServer.class.getSimpleName());
+  private static File testLogDir = new File("target",
+      TestNMWebServer.class.getSimpleName() + "LogDir");
 
   @Before
   public void setup() {
     testRootDir.mkdirs();
+    testLogDir.mkdir(); 
+  }
+
+  @After
+  public void tearDown() {
+    FileUtil.fullyDelete(testRootDir);
+    FileUtil.fullyDelete(testLogDir);
   }
 
   @Test
@@ -74,9 +87,14 @@ public class TestNMWebServer {
       }
     };
     Configuration conf = new Configuration();
-    WebServer server = new WebServer(nmContext, resourceView,
-        new ApplicationACLsManager(conf));
     conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
+    conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
+    NodeHealthCheckerService healthChecker = new NodeHealthCheckerService();
+    healthChecker.init(conf);
+    LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();
+
+    WebServer server = new WebServer(nmContext, resourceView,
+        new ApplicationACLsManager(conf), dirsHandler);
     server.init(conf);
     server.start();
 
@@ -119,20 +137,20 @@ public class TestNMWebServer {
           containerId.getApplicationAttemptId().getApplicationId();
       nmContext.getApplications().get(applicationId).getContainers()
           .put(containerId, container);
-      writeContainerLogs(conf, nmContext, containerId);
+      writeContainerLogs(nmContext, containerId, dirsHandler);
 
     }
     // TODO: Pull logs and test contents.
 //    Thread.sleep(1000000);
   }
 
-  private void writeContainerLogs(Configuration conf, Context nmContext,
-      ContainerId containerId)
+  private void writeContainerLogs(Context nmContext,
+      ContainerId containerId, LocalDirsHandlerService dirsHandler)
         throws IOException {
     // ContainerLogDir should be created
     File containerLogDir =
-        ContainerLogsPage.ContainersLogsBlock.getContainerLogDirs(conf,
-            containerId).get(0);
+        ContainerLogsPage.ContainersLogsBlock.getContainerLogDirs(containerId,
+            dirsHandler).get(0);
     containerLogDir.mkdirs();
     for (String fileType : new String[] { "stdout", "stderr", "syslog" }) {
       Writer writer = new FileWriter(new File(containerLogDir, fileType));

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java Tue Nov 29 23:17:54 2011
@@ -23,7 +23,6 @@ import java.io.IOException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.NodeHealthCheckerService;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.Path;
@@ -41,6 +40,7 @@ import org.apache.hadoop.yarn.server.api
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
 import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
 import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl;
@@ -51,7 +51,6 @@ import org.apache.hadoop.yarn.server.res
 import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager;
 import org.apache.hadoop.yarn.service.AbstractService;
 import org.apache.hadoop.yarn.service.CompositeService;
-import org.apache.hadoop.yarn.service.Service.STATE;
 
 public class MiniYARNCluster extends CompositeService {
 
@@ -69,13 +68,23 @@ public class MiniYARNCluster extends Com
   
   private File testWorkDir;
 
-  public MiniYARNCluster(String testName) {
-    //default number of nodeManagers = 1
-    this(testName, 1);
-  }
+  // Number of nm-local-dirs per nodemanager
+  private int numLocalDirs;
+  // Number of nm-log-dirs per nodemanager
+  private int numLogDirs;
+
+  /**
+   * @param testName name of the test
+   * @param noOfNodeManagers the number of node managers in the cluster
+   * @param numLocalDirs the number of nm-local-dirs per nodemanager
+   * @param numLogDirs the number of nm-log-dirs per nodemanager
+   */
+  public MiniYARNCluster(String testName, int noOfNodeManagers,
+                         int numLocalDirs, int numLogDirs) {
 
-  public MiniYARNCluster(String testName, int noOfNodeManagers) {
     super(testName);
+    this.numLocalDirs = numLocalDirs;
+    this.numLogDirs = numLogDirs;
     this.testWorkDir = new File("target", testName);
     try {
       FileContext.getLocalFSFileContext().delete(
@@ -166,25 +175,39 @@ public class MiniYARNCluster extends Com
       super.init(config);                                                        
     }                                                                            
 
+    /**
+     * Create local/log directories
+     * @param dirType type of directories i.e. local dirs or log dirs 
+     * @param numDirs number of directories
+     * @return the created directories as a comma delimited String
+     */
+    private String prepareDirs(String dirType, int numDirs) {
+      File []dirs = new File[numDirs];
+      String dirsString = "";
+      for (int i = 0; i < numDirs; i++) {
+        dirs[i]= new File(testWorkDir, MiniYARNCluster.this.getName()
+            + "-" + dirType + "Dir-nm-" + index + "_" + i);
+        dirs[i].mkdir();
+        LOG.info("Created " + dirType + "Dir in " + dirs[i].getAbsolutePath());
+        String delimiter = (i > 0) ? "," : "";
+        dirsString = dirsString.concat(delimiter + dirs[i].getAbsolutePath());
+      }
+      return dirsString;
+    }
+
     public synchronized void start() {
       try {
-        File localDir = new File(testWorkDir, MiniYARNCluster.this.getName()
-            + "-localDir-nm-" + index);
-        localDir.mkdir();
-        LOG.info("Created localDir in " + localDir.getAbsolutePath());
-        getConfig().set(YarnConfiguration.NM_LOCAL_DIRS,
-            localDir.getAbsolutePath());
-        File logDir =
-            new File(testWorkDir, MiniYARNCluster.this.getName()
-                + "-logDir-nm-" + index);
+        // create nm-local-dirs and configure them for the nodemanager
+        String localDirsString = prepareDirs("local", numLocalDirs);
+        getConfig().set(YarnConfiguration.NM_LOCAL_DIRS, localDirsString);
+        // create nm-log-dirs and configure them for the nodemanager
+        String logDirsString = prepareDirs("log", numLogDirs);
+        getConfig().set(YarnConfiguration.NM_LOG_DIRS, logDirsString);
+
         File remoteLogDir =
             new File(testWorkDir, MiniYARNCluster.this.getName()
                 + "-remoteLogDir-nm-" + index);
-        logDir.mkdir();
         remoteLogDir.mkdir();
-        LOG.info("Created logDir in " + logDir.getAbsolutePath());
-        getConfig().set(YarnConfiguration.NM_LOG_DIRS,
-            logDir.getAbsolutePath());
         getConfig().set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
             	remoteLogDir.getAbsolutePath());
         // By default AM + 2 containers

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java Tue Nov 29 23:17:54 2011
@@ -117,7 +117,7 @@ public class TestContainerManagerSecurit
     conf.setLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 100000L);
     UserGroupInformation.setConfiguration(conf);
     yarnCluster = new MiniYARNCluster(TestContainerManagerSecurity.class
-        .getName());
+        .getName(), 1, 1, 1);
     yarnCluster.init(conf);
     yarnCluster.start();
   }

Added: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java?rev=1208131&view=auto
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java (added)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java Tue Nov 29 23:17:54 2011
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileContext;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.UnsupportedFileSystemException;
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.MiniYARNCluster;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+/**
+ * Verify that the NodeManager's in-memory lists of good local dirs and good
+ * log dirs are updated properly when disks (nm-local-dirs and nm-log-dirs)
+ * fail. Also verify that the overall health status of the node is updated
+ * properly when the specified percentage of disks fail.
+ */
+public class TestDiskFailures {
+
+  private static final Log LOG = LogFactory.getLog(TestDiskFailures.class);
+
+  private static final long DISK_HEALTH_CHECK_INTERVAL = 1000;//1 sec
+
+  private static FileContext localFS = null;
+  private static final File testDir = new File("target",
+      TestDiskFailures.class.getName()).getAbsoluteFile();
+  private static final File localFSDirBase = new File(testDir,
+      TestDiskFailures.class.getName() + "-localDir");
+  private static final int numLocalDirs = 4;
+  private static final int numLogDirs = 4;
+
+  private static MiniYARNCluster yarnCluster;
+  LocalDirsHandlerService dirsHandler;
+
+  @BeforeClass
+  public static void setup() throws AccessControlException,
+      FileNotFoundException, UnsupportedFileSystemException, IOException {
+    localFS = FileContext.getLocalFSFileContext();
+    localFS.delete(new Path(localFSDirBase.getAbsolutePath()), true);
+    localFSDirBase.mkdirs();
+    // Do not start cluster here
+  }
+
+  @AfterClass
+  public static void teardown() {
+    if (yarnCluster != null) {
+      yarnCluster.stop();
+      yarnCluster = null;
+    }
+    FileUtil.fullyDelete(localFSDirBase);
+  }
+
+  /**
+   * Make local-dirs fail/inaccessible and verify that the NodeManager
+   * recognizes the disk failures and updates the list of local-dirs so
+   * that it contains only the good disks. Also verify the overall
+   * health status of the node.
+   * @throws IOException
+   */
+  @Test
+  public void testLocalDirsFailures() throws IOException {
+    testDirsFailures(true);
+  }
+
+  /**
+   * Make log-dirs fail/inaccessible and verify that the NodeManager
+   * recognizes the disk failures and updates the list of log-dirs so that
+   * it contains only the good disks. Also verify the overall health
+   * status of the node.
+   * @throws IOException
+   */  
+  @Test
+  public void testLogDirsFailures() throws IOException {
+    testDirsFailures(false);
+  }
+
+  private void testDirsFailures(boolean localORLogDirs) throws IOException {
+    String dirType = localORLogDirs ? "local" : "log";
+    String dirsProperty = localORLogDirs ? YarnConfiguration.NM_LOCAL_DIRS
+                                         : YarnConfiguration.NM_LOG_DIRS;
+
+    Configuration conf = new Configuration();
+    // set disk health check interval to a small value (say 1 sec).
+    conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
+                 DISK_HEALTH_CHECK_INTERVAL);
+
+    // If 2 out of the total 4 local-dirs fail OR if 2 out of the total 4
+    // log-dirs fail, then the node's health status should become unhealthy.
+    conf.setFloat(YarnConfiguration.NM_MIN_HEALTHY_DISKS_FRACTION, 0.60F);
+
+    if (yarnCluster != null) {
+      yarnCluster.stop();
+      FileUtil.fullyDelete(localFSDirBase);
+      localFSDirBase.mkdirs();
+    }
+    LOG.info("Starting up YARN cluster");
+    yarnCluster = new MiniYARNCluster(TestDiskFailures.class.getName(),
+        1, numLocalDirs, numLogDirs);
+    yarnCluster.init(conf);
+    yarnCluster.start();
+
+    NodeManager nm = yarnCluster.getNodeManager(0);
+    LOG.info("Configured nm-" + dirType + "-dirs="
+             + nm.getConfig().get(dirsProperty));
+    dirsHandler = nm.getNodeHealthChecker().getDiskHandler();
+    List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
+                                       : dirsHandler.getLogDirs();
+    String[] dirs = list.toArray(new String[list.size()]);
+    Assert.assertEquals("Number of nm-" + dirType + "-dirs is wrong.",
+                        numLocalDirs, dirs.length);
+    String expectedDirs = StringUtils.join(",", list);
+    // validate the health of disks initially
+    verifyDisksHealth(localORLogDirs, expectedDirs, true);
+
+    // Make 1 nm-local-dir/nm-log-dir fail and verify that the nodemanager
+    // can identify the disk failure(s) and can update the list of good
+    // nm-local-dirs/nm-log-dirs.
+    prepareDirToFail(dirs[2]);
+    expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3];
+    verifyDisksHealth(localORLogDirs, expectedDirs, true);
+
+    // Now, make 1 more nm-local-dir/nm-log-dir fail and verify that the
+    // nodemanager can identify the disk failures and can update the list of
+    // good nm-local-dirs/nm-log-dirs and can update the overall health status
+    // of the node to unhealthy.
+    prepareDirToFail(dirs[0]);
+    expectedDirs = dirs[1] + "," + dirs[3];
+    verifyDisksHealth(localORLogDirs, expectedDirs, false);
+
+    // Fail the remaining 2 local-dirs/log-dirs and verify if NM remains with
+    // empty list of local-dirs/log-dirs and the overall health status is
+    // unhealthy.
+    prepareDirToFail(dirs[1]);
+    prepareDirToFail(dirs[3]);
+    expectedDirs = "";
+    verifyDisksHealth(localORLogDirs, expectedDirs, false);
+  }
+
+  /**
+   * Wait for the NodeManager to run its disk-health-check at least once.
+   */
+  private void waitForDiskHealthCheck() {
+    long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
+    long time = lastDisksCheckTime;
+    for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
+      try {
+        Thread.sleep(1000);
+      } catch(InterruptedException e) {
+        LOG.error(
+            "Interrupted while waiting for NodeManager's disk health check.");
+      }
+      time = dirsHandler.getLastDisksCheckTime();
+    }
+  }
+
+  /**
+   * Verify that the NodeManager could identify the disk failures.
+   * @param localORLogDirs <em>true</em> means nm-local-dirs and <em>false
+   *                       </em> means nm-log-dirs
+   * @param expectedDirs expected nm-local-dirs/nm-log-dirs as a string
+   * @param isHealthy <em>true</em> if the overall node should be healthy
+   */
+  private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
+      boolean isHealthy) {
+    // Wait for the NodeManager to identify disk failures.
+    waitForDiskHealthCheck();
+
+    List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
+                                       : dirsHandler.getLogDirs();
+    String seenDirs = StringUtils.join(",", list);
+    LOG.info("ExpectedDirs=" + expectedDirs);
+    LOG.info("SeenDirs=" + seenDirs);
+    Assert.assertTrue("NodeManager could not identify disk failure.",
+                      expectedDirs.equals(seenDirs));
+
+    Assert.assertEquals("Node's health in terms of disks is wrong",
+                        isHealthy, dirsHandler.areDisksHealthy());
+    for (int i = 0; i < 10; i++) {
+      Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext()
+                              .getRMNodes().values().iterator();
+      if (iter.next().getNodeHealthStatus().getIsNodeHealthy() == isHealthy) {
+        break;
+      }
+      // wait for the node health info to go to RM
+      try {
+        Thread.sleep(1000);
+      } catch(InterruptedException e) {
+        LOG.error("Interrupted while waiting for NM->RM heartbeat.");
+      }
+    }
+    Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext()
+                            .getRMNodes().values().iterator();
+    Assert.assertEquals("RM is not updated with the health status of a node",
+        isHealthy, iter.next().getNodeHealthStatus().getIsNodeHealthy());
+  }
+
+  /**
+   * Prepare a directory for failure: replace the given directory on the
+   * local FileSystem with a regular file of the same name. This causes
+   * DiskChecker.checkDir() to fail when it tries to create a directory
+   * with the same name.
+   * @param dir the directory to be failed
+   * @throws IOException 
+   */
+  private void prepareDirToFail(String dir) throws IOException {
+    File file = new File(dir);
+    FileUtil.fullyDelete(file);
+    file.createNewFile();
+    LOG.info("Prepared " + dir + " to fail.");
+  }
+}
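
The test above, like the TestContainerManagerSecurity change earlier in this commit, uses the new four-argument MiniYARNCluster constructor (test name, number of NodeManagers, nm-local-dirs per NodeManager, nm-log-dirs per NodeManager), which replaces the old one- and two-argument forms. A minimal start-up sketch under that assumption follows; the class name and directory counts are illustrative, and the accessors used are only those that appear in TestDiskFailures.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;
    import org.apache.hadoop.yarn.server.MiniYARNCluster;

    public class MiniYarnClusterSketch {
      public static void main(String[] args) throws Exception {
        // testName, number of NodeManagers, nm-local-dirs per NM, nm-log-dirs per NM
        MiniYARNCluster cluster =
            new MiniYARNCluster("MiniYarnClusterSketch", 1, 2, 2);

        Configuration conf = new Configuration();
        cluster.init(conf);
        cluster.start();
        try {
          // Inspect the NodeManager's disk state, as TestDiskFailures does.
          System.out.println("Disks healthy: " + cluster.getNodeManager(0)
              .getNodeHealthChecker().getDiskHandler().areDisksHealthy());
        } finally {
          cluster.stop();
        }
      }
    }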

Modified: hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm?rev=1208131&r1=1208130&r2=1208131&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm Tue Nov 29 23:17:54 2011
@@ -398,6 +398,15 @@ Hadoop MapReduce Next Generation - Clust
 | | | Timeout for health script execution. |
 *-------------------------+-------------------------+------------------------+
 
+    The health checker script is not supposed to give ERROR if only some of the
+    local disks become bad. The NodeManager also has the ability to periodically
+    check the health of the local disks (specifically nodemanager-local-dirs
+    and nodemanager-log-dirs); after the number of bad directories reaches the
+    threshold set by the config property
+    yarn.nodemanager.disk-health-checker.min-healthy-disks, the whole node is
+    marked unhealthy. The boot disk is either raided or a failure in the boot
+    disk is identified by the health checker script.
+
     * {Slaves file}
       
     Typically you choose one machine in the cluster to act as the NameNode and 
@@ -874,13 +883,6 @@ KVNO Timestamp         Principal
 *-------------------------+-------------------------+------------------------+
 || Parameter              || Value                  || Notes                 |
 *-------------------------+-------------------------+------------------------+
-| <<<yarn.nodemanager.local-dirs>>> | |
-| | Comma-separated list of NodeManager local directories. | |
-| | | Paths to NodeManager local directories. Should be same as the value |
-| | | which was provided to key in <<<conf/yarn-site.xml>>>. This is |
-| | | required to validate paths passed to the setuid executable in order |
-| | to prevent arbitrary paths being passed to it. |
-*-------------------------+-------------------------+------------------------+
 | <<<yarn.nodemanager.linux-container-executor.group>>> | <hadoop> | |
 | | | Unix group of the NodeManager. The group owner of the |
 | | |<container-executor> binary should be this group. Should be same as the |
@@ -888,14 +890,6 @@ KVNO Timestamp         Principal
 | | | required for validating the secure access of the <container-executor> |
 | | | binary. |        
 *-------------------------+-------------------------+------------------------+
-| <<<yarn.nodemanager.log-dirs>>> | |
-| | Comma-separated list of NodeManager log directories. | |
-| | | Paths to NodeManager log directories. Should be same as the value |
-| | | which was provided to key in <<<conf/yarn-site.xml>>>. This is |
-| | | required to set proper permissions on the log files so that they can |
-| | | be written to by the user's containers and read by the NodeManager for |
-| | | <log aggregation>. |
-*-------------------------+-------------------------+------------------------+
 | <<<banned.users>>> | hfds,yarn,mapred,bin | Banned users. |
 *-------------------------+-------------------------+------------------------+
 | <<<min.user.id>>> | 1000 | Prevent other super-users. |
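
The disk-health behaviour described in the ClusterSetup text added above maps onto the new LocalDirsHandlerService introduced in this commit. Below is a minimal sketch of setting the disk-health thresholds and reading back the resulting state, using only configuration constants and accessors that appear in the tests in this commit; the class name and the interval/fraction values are illustrative assumptions, and NM_MIN_HEALTHY_DISKS_FRACTION is simply the constant the tests use for the min-healthy-disks threshold.

    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;
    import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;

    public class DiskHealthConfigSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Check nm-local-dirs/nm-log-dirs every 2 minutes and mark the whole node
        // unhealthy once fewer than 25% of them are good (illustrative values).
        conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS, 2 * 60 * 1000L);
        conf.setFloat(YarnConfiguration.NM_MIN_HEALTHY_DISKS_FRACTION, 0.25F);

        LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
        dirsHandler.init(conf);

        // The handler keeps the current lists of good directories and an overall verdict.
        List<String> goodLocalDirs = dirsHandler.getLocalDirs();
        List<String> goodLogDirs = dirsHandler.getLogDirs();
        boolean disksHealthy = dirsHandler.areDisksHealthy();
        System.out.println("local=" + goodLocalDirs + " log=" + goodLogDirs
            + " healthy=" + disksHealthy);
      }
    }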