You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by vi...@apache.org on 2014/11/04 01:39:38 UTC
git commit: YARN-1922. Fixed NodeManager to kill process-trees
correctly in the presence of races between the launch and the stop-container
call and when root processes crash. Contributed by Billie Rinaldi.
Repository: hadoop
Updated Branches:
refs/heads/trunk 35d353e0f -> c5a46d4c8
YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence of races between the launch and the stop-container call and when root processes crash. Contributed by Billie Rinaldi.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/c5a46d4c
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/c5a46d4c
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/c5a46d4c
Branch: refs/heads/trunk
Commit: c5a46d4c8ca236ff641a309f256bbbdf4dd56db5
Parents: 35d353e
Author: Vinod Kumar Vavilapalli <vi...@apache.org>
Authored: Mon Nov 3 16:38:55 2014 -0800
Committer: Vinod Kumar Vavilapalli <vi...@apache.org>
Committed: Mon Nov 3 16:38:55 2014 -0800
----------------------------------------------------------------------
hadoop-yarn-project/CHANGES.txt | 4 +
.../launcher/ContainerLaunch.java | 5 +-
.../launcher/TestContainerLaunch.java | 111 +++++++++++++++++++
3 files changed, 117 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/c5a46d4c/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 25d03f4..f7c0dfa 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -852,6 +852,10 @@ Release 2.6.0 - UNRELEASED
YARN-2795. Fixed ResourceManager to not crash loading node-label data from
HDFS in secure mode. (Wangda Tan via vinodkv)
+ YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence
+ of races between the launch and the stop-container call and when root
+ processes crash. (Billie Rinaldi via vinodkv)
+
Release 2.5.1 - 2014-09-05
INCOMPATIBLE CHANGES
http://git-wip-us.apache.org/repos/asf/hadoop/blob/c5a46d4c/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
index 434cb4e..57e3bb9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
@@ -461,9 +461,8 @@ public class ContainerLaunch implements Callable<Integer> {
final int sleepInterval = 100;
// loop waiting for pid file to show up
- // until either the completed flag is set which means something bad
- // happened or our timer expires in which case we admit defeat
- while (!completed.get()) {
+ // until our timer expires in which case we admit defeat
+ while (true) {
processId = ProcessIdFileReader.getProcessId(pidFilePath);
if (processId != null) {
LOG.debug("Got pid " + processId + " for container "
http://git-wip-us.apache.org/repos/asf/hadoop/blob/c5a46d4c/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
index 186788d..001643b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
@@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -969,4 +970,114 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
assertThat(e.getMessage(), containsString(expectedMessage));
}
}
+
+ @Test
+ public void testKillProcessGroup() throws Exception {
+ Assume.assumeTrue(Shell.isSetsidAvailable);
+ containerManager.start();
+
+ // Construct the Container-id
+ ApplicationId appId = ApplicationId.newInstance(2, 2);
+ ApplicationAttemptId appAttemptId =
+ ApplicationAttemptId.newInstance(appId, 1);
+ ContainerId cId = ContainerId.newInstance(appAttemptId, 0);
+ File processStartFile =
+ new File(tmpDir, "pid.txt").getAbsoluteFile();
+ File childProcessStartFile =
+ new File(tmpDir, "child_pid.txt").getAbsoluteFile();
+
+ // setup a script that can handle sigterm gracefully
+ File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript");
+ PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
+ writer.println("#!/bin/bash\n\n");
+ writer.println("echo \"Running testscript for forked process\"");
+ writer.println("umask 0");
+ writer.println("echo $$ >> " + processStartFile);
+ writer.println("while true;\ndo sleep 1s;\ndone > /dev/null 2>&1 &");
+ writer.println("echo $! >> " + childProcessStartFile);
+ writer.println("while true;\ndo sleep 1s;\ndone");
+ writer.close();
+ FileUtil.setExecutable(scriptFile, true);
+
+ ContainerLaunchContext containerLaunchContext =
+ recordFactory.newRecordInstance(ContainerLaunchContext.class);
+
+ // upload the script file so that the container can run it
+ URL resource_alpha =
+ ConverterUtils.getYarnUrlFromPath(localFS
+ .makeQualified(new Path(scriptFile.getAbsolutePath())));
+ LocalResource rsrc_alpha =
+ recordFactory.newRecordInstance(LocalResource.class);
+ rsrc_alpha.setResource(resource_alpha);
+ rsrc_alpha.setSize(-1);
+ rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
+ rsrc_alpha.setType(LocalResourceType.FILE);
+ rsrc_alpha.setTimestamp(scriptFile.lastModified());
+ String destinationFile = "dest_file.sh";
+ Map<String, LocalResource> localResources =
+ new HashMap<String, LocalResource>();
+ localResources.put(destinationFile, rsrc_alpha);
+ containerLaunchContext.setLocalResources(localResources);
+
+ // set up the rest of the container
+ List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
+ containerLaunchContext.setCommands(commands);
+ Priority priority = Priority.newInstance(10);
+ long createTime = 1234;
+ Token containerToken = createContainerToken(cId, priority, createTime);
+
+ StartContainerRequest scRequest =
+ StartContainerRequest.newInstance(containerLaunchContext,
+ containerToken);
+ List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
+ list.add(scRequest);
+ StartContainersRequest allRequests =
+ StartContainersRequest.newInstance(list);
+ containerManager.startContainers(allRequests);
+
+ int timeoutSecs = 0;
+ while (!processStartFile.exists() && timeoutSecs++ < 20) {
+ Thread.sleep(1000);
+ LOG.info("Waiting for process start-file to be created");
+ }
+ Assert.assertTrue("ProcessStartFile doesn't exist!",
+ processStartFile.exists());
+
+ BufferedReader reader =
+ new BufferedReader(new FileReader(processStartFile));
+ // Get the pid of the process
+ String pid = reader.readLine().trim();
+ // No more lines
+ Assert.assertEquals(null, reader.readLine());
+ reader.close();
+
+ reader =
+ new BufferedReader(new FileReader(childProcessStartFile));
+ // Get the pid of the child process
+ String child = reader.readLine().trim();
+ // No more lines
+ Assert.assertEquals(null, reader.readLine());
+ reader.close();
+
+ LOG.info("Manually killing pid " + pid + ", but not child pid " + child);
+ Shell.execCommand(new String[]{"kill", "-9", pid});
+
+ BaseContainerManagerTest.waitForContainerState(containerManager, cId,
+ ContainerState.COMPLETE);
+
+ Assert.assertFalse("Process is still alive!",
+ DefaultContainerExecutor.containerIsAlive(pid));
+
+ List<ContainerId> containerIds = new ArrayList<ContainerId>();
+ containerIds.add(cId);
+
+ GetContainerStatusesRequest gcsRequest =
+ GetContainerStatusesRequest.newInstance(containerIds);
+
+ ContainerStatus containerStatus =
+ containerManager.getContainerStatuses(gcsRequest)
+ .getContainerStatuses().get(0);
+ Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
+ containerStatus.getExitStatus());
+ }
}