You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jl...@apache.org on 2014/11/13 17:12:10 UTC

hadoop git commit: YARN-2846. Incorrect persist exit code for running containers in reacquireContainer() that interrupted by NodeManager restart. Contributed by Junping Du

Repository: hadoop
Updated Branches:
  refs/heads/trunk 177e8090f -> 33ea5ae92


YARN-2846. Incorrect persist exit code for running containers in reacquireContainer() that interrupted by NodeManager restart. Contributed by Junping Du


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/33ea5ae9
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/33ea5ae9
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/33ea5ae9

Branch: refs/heads/trunk
Commit: 33ea5ae92b9dd3abace104903d9a94d17dd75af5
Parents: 177e809
Author: Jason Lowe <jl...@apache.org>
Authored: Thu Nov 13 16:11:04 2014 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Thu Nov 13 16:11:04 2014 +0000

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  4 ++++
 .../server/nodemanager/ContainerExecutor.java   | 21 +++++++-------------
 .../nodemanager/LinuxContainerExecutor.java     |  2 +-
 .../launcher/RecoveredContainerLaunch.java      | 20 ++++++++++++-------
 4 files changed, 25 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/33ea5ae9/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 01dc246..326d33c 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -955,6 +955,10 @@ Release 2.6.0 - 2014-11-15
     YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
     zjshen)
 
+    YARN-2846. Incorrect persist exit code for running containers in
+    reacquireContainer() that interrupted by NodeManager restart. (Junping Du
+    via jlowe)
+
 Release 2.5.2 - 2014-11-10
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/33ea5ae9/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
index 8133413..327f882 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
@@ -159,9 +159,10 @@ public abstract class ContainerExecutor implements Configurable {
    * @param containerId The ID of the container to reacquire
    * @return The exit code of the pre-existing container
    * @throws IOException
+   * @throws InterruptedException 
    */
   public int reacquireContainer(String user, ContainerId containerId)
-      throws IOException {
+      throws IOException, InterruptedException {
     Path pidPath = getPidFilePath(containerId);
     if (pidPath == null) {
       LOG.warn(containerId + " is not active, returning terminated error");
@@ -175,13 +176,8 @@ public abstract class ContainerExecutor implements Configurable {
     }
 
     LOG.info("Reacquiring " + containerId + " with pid " + pid);
-    try {
-      while(isContainerProcessAlive(user, pid)) {
-        Thread.sleep(1000);
-      }
-    } catch (InterruptedException e) {
-      throw new IOException("Interrupted while waiting for process " + pid
-          + " to exit", e);
+    while(isContainerProcessAlive(user, pid)) {
+      Thread.sleep(1000);
     }
 
     // wait for exit code file to appear
@@ -194,12 +190,9 @@ public abstract class ContainerExecutor implements Configurable {
         LOG.info(containerId + " was deactivated");
         return ExitCode.TERMINATED.getExitCode();
       }
-      try {
-        Thread.sleep(sleepMsec);
-      } catch (InterruptedException e) {
-        throw new IOException(
-            "Interrupted while waiting for exit code from " + containerId, e);
-      }
+      
+      Thread.sleep(sleepMsec);
+      
       msecLeft -= sleepMsec;
     }
     if (msecLeft < 0) {

http://git-wip-us.apache.org/repos/asf/hadoop/blob/33ea5ae9/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
index 60cb058..4db4ef2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
@@ -347,7 +347,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
 
   @Override
   public int reacquireContainer(String user, ContainerId containerId)
-      throws IOException {
+      throws IOException, InterruptedException {
     try {
       return super.reacquireContainer(user, containerId);
     } finally {

http://git-wip-us.apache.org/repos/asf/hadoop/blob/33ea5ae9/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
index 446695a..03a39aa 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
@@ -73,6 +73,7 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
         ContainerEventType.CONTAINER_LAUNCHED));
 
+    boolean notInterrupted = true;
     try {
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       if (pidFile != null) {
@@ -85,14 +86,19 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
       }
     } catch (IOException e) {
         LOG.error("Unable to recover container " + containerIdStr, e);
+    } catch (InterruptedException e) {
+      LOG.warn("Interrupted while waiting for exit code from " + containerId);
+      notInterrupted = false;
     } finally {
-      this.completed.set(true);
-      exec.deactivateContainer(containerId);
-      try {
-        getContext().getNMStateStore().storeContainerCompleted(containerId,
-            retCode);
-      } catch (IOException e) {
-        LOG.error("Unable to set exit code for container " + containerId);
+      if (notInterrupted) {
+        this.completed.set(true);
+        exec.deactivateContainer(containerId);
+        try {
+          getContext().getNMStateStore().storeContainerCompleted(containerId,
+              retCode);
+        } catch (IOException e) {
+          LOG.error("Unable to set exit code for container " + containerId);
+        }
       }
     }