You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by vi...@apache.org on 2011/10/27 14:06:09 UTC

svn commit: r1189713 - in /hadoop/common/branches/branch-0.23/hadoop-mapreduce-project: ./ hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/ hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/...

Author: vinodkv
Date: Thu Oct 27 12:06:08 2011
New Revision: 1189713

URL: http://svn.apache.org/viewvc?rev=1189713&view=rev
Log:
MAPREDUCE-3240. Fixed NodeManager to be able to forcefully clean up its containers (process-trees) irrespective of whether the container succeeded or was killed. Contributed by Hitesh Shah.

Modified:
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt Thu Oct 27 12:06:08 2011
@@ -1743,6 +1743,10 @@ Release 0.23.0 - Unreleased
     MAPREDUCE-3279. Fixed TestJobHistoryParsing which assumed user name to be
     mapred all the time. (Siddharth Seth via acmurthy)
 
+    MAPREDUCE-3240. Fixed NodeManager to be able to forcefully cleanup its
+    containers (process-trees) irrespective of whether the container succeeded,
+    or killed. Contributed by Hitesh Shah.
+
 Release 0.22.0 - Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java Thu Oct 27 12:06:08 2011
@@ -166,8 +166,6 @@ public abstract class RMContainerRequest
           for (ResourceRequest req : reqMap.values()) {
             if (!ask.remove(req)) {
               foundAll = false;
-            }
-            else {
               // if ask already sent to RM, we can try and overwrite it if possible.
               // send a new ask to RM with numContainers
               // specified for the blacklisted host to be 0.
@@ -181,7 +179,7 @@ public abstract class RMContainerRequest
           // we can remove this request
           if (foundAll) {
             remoteRequests.remove(hostName);
-          }     
+          }
         }
         // TODO handling of rack blacklisting
         // Removing from rack should be dependent on no. of failures within the rack 

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java Thu Oct 27 12:06:08 2011
@@ -411,6 +411,20 @@ public class YarnConfiguration extends C
   YARN_SECURITY_SERVICE_AUTHORIZATION_RESOURCE_LOCALIZER =
       "security.resourcelocalizer.protocol.acl";
 
+  /** No. of milliseconds to wait between sending a SIGTERM and SIGKILL
+   * to a running container */
+  public static final String NM_SLEEP_DELAY_BEFORE_SIGKILL_MS =
+      NM_PREFIX + "sleep-delay-before-sigkill.ms";
+  public static final long DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS =
+      250;
+
+  /** Max time to wait for a process to come up when trying to cleanup
+   * container resources */
+  public static final String NM_PROCESS_KILL_WAIT_MS =
+      NM_PREFIX + "process-kill-wait.ms";
+  public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
+      2000;
+
   public YarnConfiguration() {
     super();
   }

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml Thu Oct 27 12:06:08 2011
@@ -366,6 +366,18 @@
     <!-- <value>mapreduce.shuffle</value> -->
   </property>
 
+  <property>
+    <description>No. of ms to wait between sending a SIGTERM and SIGKILL to a container</description>
+    <name>yarn.nodemanager.sleep-delay-before-sigkill.ms</name>
+    <value>250</value>
+  </property>
+
+  <property>
+    <description>Max time to wait for a process to come up when trying to cleanup a container</description>
+    <name>yarn.nodemanager.process-kill-wait.ms</name>
+    <value>2000</value>
+  </property>
+
   <!--Map Reduce configuration-->
   <property>
     <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c Thu Oct 27 12:06:08 2011
@@ -45,6 +45,9 @@ FILE* ERRORFILE = NULL;
 static uid_t nm_uid = -1;
 static gid_t nm_gid = -1;
 
+char *concatenate(char *concat_pattern, char *return_path_name,
+   int numArgs, ...);
+
 void set_nm_uid(uid_t user, gid_t group) {
   nm_uid = user;
   nm_gid = group;
@@ -148,6 +151,60 @@ static int change_effective_user(uid_t u
 }
 
 /**
+ * Write the pid of the current process into the pid file.
+ * pid_file: Path to pid file where pid needs to be written to
+ */
+static int write_pid_to_file_as_nm(const char* pid_file, pid_t pid) {
+  uid_t user = geteuid();
+  gid_t group = getegid();
+  if (change_effective_user(nm_uid, nm_gid) != 0) {
+    return -1;
+  }
+
+  char *temp_pid_file = concatenate("%s.tmp", "pid_file_path", 1, pid_file);
+
+  // create with 700
+  int pid_fd = open(temp_pid_file, O_WRONLY|O_CREAT|O_EXCL, S_IRWXU);
+  if (pid_fd == -1) {
+    fprintf(LOGFILE, "Can't open file %s as node manager - %s\n", temp_pid_file,
+           strerror(errno));
+    free(temp_pid_file);
+    return -1;
+  }
+
+  // write pid to temp file
+  char pid_buf[21];
+  snprintf(pid_buf, 21, "%d", pid);
+  ssize_t written = write(pid_fd, pid_buf, strlen(pid_buf));
+  close(pid_fd);
+  if (written == -1) {
+    fprintf(LOGFILE, "Failed to write pid to file %s as node manager - %s\n",
+       temp_pid_file, strerror(errno));
+    free(temp_pid_file);
+    return -1;
+  }
+
+  // rename temp file to actual pid file
+  // use rename as atomic
+  if (rename(temp_pid_file, pid_file)) {
+    fprintf(LOGFILE, "Can't move pid file from %s to %s as node manager - %s\n",
+        temp_pid_file, pid_file, strerror(errno));
+    unlink(temp_pid_file);
+    free(temp_pid_file);
+    return -1;
+  }
+
+  // Revert back to the calling user.
+  if (change_effective_user(user, group)) {
+	free(temp_pid_file);
+    return -1;
+  }
+
+  free(temp_pid_file);
+  return 0;
+}
+
+/**
  * Change the real and effective user and group to abandon the super user
  * priviledges.
  */
@@ -749,7 +806,8 @@ int initialize_app(const char *user, con
 
 int launch_container_as_user(const char *user, const char *app_id, 
                      const char *container_id, const char *work_dir,
-                     const char *script_name, const char *cred_file) {
+                     const char *script_name, const char *cred_file,
+                     const char* pid_file) {
   int exit_code = -1;
   char *script_file_dest = NULL;
   char *cred_file_dest = NULL;
@@ -776,6 +834,20 @@ int launch_container_as_user(const char 
     goto cleanup;
   }
 
+  // setsid 
+  pid_t pid = setsid();
+  if (pid == -1) {
+    exit_code = SETSID_OPER_FAILED;
+    goto cleanup;
+  }
+
+  // write pid to pidfile
+  if (pid_file == NULL
+      || write_pid_to_file_as_nm(pid_file, pid) != 0) {
+    exit_code = WRITE_PIDFILE_FAILED;
+    goto cleanup;
+  }  
+
   // give up root privs
   if (change_user(user_detail->pw_uid, user_detail->pw_gid) != 0) {
     exit_code = SETUID_OPER_FAILED;
@@ -1031,3 +1103,5 @@ int delete_as_user(const char *user,
   }
   return ret;
 }
+
+

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h Thu Oct 27 12:06:08 2011
@@ -51,7 +51,9 @@ enum errorcodes {
   UNABLE_TO_BUILD_PATH, //21
   INVALID_CONTAINER_EXEC_PERMISSIONS, //22
   // PREPARE_JOB_LOGS_FAILED (NOT USED) 23
-  INVALID_CONFIG_FILE =  24
+  INVALID_CONFIG_FILE =  24,
+  SETSID_OPER_FAILED = 25,
+  WRITE_PIDFILE_FAILED = 26
 };
 
 #define NM_GROUP_KEY "yarn.nodemanager.linux-container-executor.group"
@@ -106,11 +108,13 @@ int initialize_app(const char *user, con
  * @param script_name the name of the script to be run to launch the container.
  * @param cred_file the credentials file that needs to be compied to the
  * working directory.
+ * @param pid_file file where pid of process should be written to
  * @return -1 or errorcode enum value on error (should never return on success).
  */
 int launch_container_as_user(const char * user, const char *app_id,
                      const char *container_id, const char *work_dir,
-                     const char *script_name, const char *cred_file);
+                     const char *script_name, const char *cred_file,
+                     const char *pid_file);
 
 /**
  * Function used to signal a container launched by the user.

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c Thu Oct 27 12:06:08 2011
@@ -66,6 +66,7 @@ int main(int argc, char **argv) {
   const char * cred_file = NULL;
   const char * script_file = NULL;
   const char * current_dir = NULL;
+  const char * pid_file = NULL;
 
   int exit_code = 0;
 
@@ -141,7 +142,7 @@ int main(int argc, char **argv) {
                                argv + optind);
     break;
   case LAUNCH_CONTAINER:
-    if (argc < 8) {
+    if (argc < 9) {
       fprintf(ERRORFILE, "Too few arguments (%d vs 8) for launch container\n",
 	      argc);
       fflush(ERRORFILE);
@@ -152,8 +153,9 @@ int main(int argc, char **argv) {
     current_dir = argv[optind++];
     script_file = argv[optind++];
     cred_file = argv[optind++];
+    pid_file = argv[optind++];
     exit_code = launch_container_as_user(user_detail->pw_name, app_id, container_id,
-                                 current_dir, script_file, cred_file);
+                                 current_dir, script_file, cred_file, pid_file);
     break;
   case SIGNAL_CONTAINER:
     if (argc < 5) {

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c Thu Oct 27 12:06:08 2011
@@ -590,6 +590,7 @@ void test_run_container() {
   fflush(stderr);
   char* container_dir = get_container_work_directory(TEST_ROOT "/local-1", 
 					      username, "app_4", "container_1");
+  const char * pid_file = TEST_ROOT "/pid.txt";
   pid_t child = fork();
   if (child == -1) {
     printf("FAIL: failed to fork process for init_app - %s\n", 
@@ -597,7 +598,7 @@ void test_run_container() {
     exit(1);
   } else if (child == 0) {
     if (launch_container_as_user(username, "app_4", "container_1", 
-                         container_dir, script_name, TEST_ROOT "creds.txt") != 0) {
+                         container_dir, script_name, TEST_ROOT "/creds.txt", pid_file) != 0) {
       printf("FAIL: failed in child\n");
       exit(42);
     }
@@ -631,6 +632,32 @@ void test_run_container() {
     exit(1);
   }
   free(container_dir);
+
+  if(access(pid_file, R_OK) != 0) {
+    printf("FAIL: failed to create pid file %s\n", pid_file);
+    exit(1);
+  }
+  int pidfd = open(pid_file, O_RDONLY);
+  if (pidfd == -1) {
+    printf("FAIL: failed to open pid file %s - %s\n", pid_file, strerror(errno));
+    exit(1);
+  }
+
+  char pidBuf[100];
+  ssize_t bytes = read(pidfd, pidBuf, 100);
+  if (bytes == -1) {
+    printf("FAIL: failed to read from pid file %s - %s\n", pid_file, strerror(errno));
+    exit(1);
+  }
+
+  pid_t mypid = child;
+  char myPidBuf[33];
+  snprintf(myPidBuf, 33, "%d", mypid);
+  if (strncmp(pidBuf, myPidBuf, strlen(myPidBuf)) != 0) {
+    printf("FAIL: failed to find matching pid in pid file\n");
+    printf("FAIL: Expected pid %d : Got %.*s", mypid, (int)bytes, pidBuf);
+    exit(1);
+  }
 }
 
 int main(int argc, char **argv) {

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -19,12 +19,13 @@
 package org.apache.hadoop.yarn.server.nodemanager;
 
 import java.io.IOException;
-import java.lang.reflect.Field;
-
 import java.net.InetSocketAddress;
 import java.util.List;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -35,6 +36,7 @@ import org.apache.hadoop.fs.permission.F
 import org.apache.hadoop.util.Shell.ShellCommandExecutor;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
 
 public abstract class ContainerExecutor implements Configurable {
 
@@ -43,8 +45,12 @@ public abstract class ContainerExecutor 
     FsPermission.createImmutable((short) 0700);
 
   private Configuration conf;
-  protected ConcurrentMap<ContainerId, ShellCommandExecutor> launchCommandObjs =
-      new ConcurrentHashMap<ContainerId, ShellCommandExecutor>();
+  private ConcurrentMap<ContainerId, Path> pidFiles =
+      new ConcurrentHashMap<ContainerId, Path>();
+
+  private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
+  private final ReadLock readLock = lock.readLock();
+  private final WriteLock writeLock = lock.writeLock();
 
   @Override
   public void setConf(Configuration conf) {
@@ -102,7 +108,8 @@ public abstract class ContainerExecutor 
       throws IOException, InterruptedException;
 
   public enum ExitCode {
-    KILLED(137);
+    FORCE_KILLED(137),
+    TERMINATED(143);
     private final int code;
 
     private ExitCode(int exitCode) {
@@ -150,6 +157,66 @@ public abstract class ContainerExecutor 
   }
 
   /**
+   * Get the pidFile of the container.
+   * @param containerId
+   * @return the path of the pid-file for the given containerId.
+   */
+  protected Path getPidFilePath(ContainerId containerId) {
+    try {
+      readLock.lock();
+      return (this.pidFiles.get(containerId));
+    } finally {
+      readLock.unlock();
+    }
+  }
+
+  /**
+   * Is the container still active?
+   * @param containerId
+   * @return true if the container is active else false.
+   */
+  protected boolean isContainerActive(ContainerId containerId) {
+    try {
+      readLock.lock();
+      return (this.pidFiles.containsKey(containerId));
+    } finally {
+      readLock.unlock();
+    }
+  }
+
+  /**
+   * Mark the container as active
+   * 
+   * @param containerId
+   *          the ContainerId
+   * @param pidFilePath
+   *          Path where the executor should write the pid of the launched
+   *          process
+   */
+  public void activateContainer(ContainerId containerId, Path pidFilePath) {
+    try {
+      writeLock.lock();
+      this.pidFiles.put(containerId, pidFilePath);
+    } finally {
+      writeLock.unlock();
+    }
+  }
+
+  /**
+   * Mark the container as inactive.
+   * Done iff the container is still active. Else treat it as
+   * a no-op
+   */
+  public void deactivateContainer(ContainerId containerId) {
+    try {
+      writeLock.lock();
+      this.pidFiles.remove(containerId);
+    } finally {
+      writeLock.unlock();
+    }
+  }
+
+  /**
    * Get the process-identifier for the container
    * 
    * @param containerID
@@ -158,28 +225,15 @@ public abstract class ContainerExecutor 
    */
   public String getProcessId(ContainerId containerID) {
     String pid = null;
-    ShellCommandExecutor shExec = launchCommandObjs.get(containerID);
-    if (shExec == null) {
+    Path pidFile = pidFiles.get(containerID);
+    if (pidFile == null) {
       // This container isn't even launched yet.
       return pid;
     }
-    Process proc = shExec.getProcess();
-    if (proc == null) {
-      // This happens if the command is not yet started
-      return pid;
-    }
     try {
-      Field pidField = proc.getClass().getDeclaredField("pid");
-      pidField.setAccessible(true);
-      pid = ((Integer) pidField.get(proc)).toString();
-    } catch (SecurityException e) {
-      // SecurityManager not expected with yarn. Ignore.
-    } catch (NoSuchFieldException e) {
-      // Yarn only on UNIX for now. Ignore.
-    } catch (IllegalArgumentException e) {
-      ;
-    } catch (IllegalAccessException e) {
-      ;
+      pid = ProcessIdFileReader.getProcessId(pidFile);
+    } catch (IOException e) {
+      LOG.error("Got exception reading pid from pid-file " + pidFile, e);
     }
     return pid;
   }

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -18,10 +18,16 @@
 
 package org.apache.hadoop.yarn.server.nodemanager;
 
+import static org.apache.hadoop.fs.CreateFlag.CREATE;
+import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
+
+import java.io.DataOutputStream;
 import java.io.File;
 import java.io.IOException;
+import java.io.PrintStream;
 import java.net.InetSocketAddress;
 import java.util.Arrays;
+import java.util.EnumSet;
 import java.util.List;
 
 import org.apache.commons.logging.Log;
@@ -48,6 +54,9 @@ public class DefaultContainerExecutor ex
 
   private final FileContext lfs;
 
+  private static final String WRAPPER_LAUNCH_SCRIPT = 
+      "default_container_executor.sh";
+
   public DefaultContainerExecutor() {
     try {
       this.lfs = FileContext.getLocalFSFileContext();
@@ -100,8 +109,9 @@ public class DefaultContainerExecutor ex
         ConverterUtils.toString(
             container.getContainerID().getApplicationAttemptId().
                 getApplicationId());
-    String[] sLocalDirs =
-        getConf().getStrings(YarnConfiguration.NM_LOCAL_DIRS, YarnConfiguration.DEFAULT_NM_LOCAL_DIRS);
+    String[] sLocalDirs = getConf().getStrings(
+        YarnConfiguration.NM_LOCAL_DIRS,
+        YarnConfiguration.DEFAULT_NM_LOCAL_DIRS);
     for (String sLocalDir : sLocalDirs) {
       Path usersdir = new Path(sLocalDir, ContainerLocalizer.USERCACHE);
       Path userdir = new Path(usersdir, userName);
@@ -124,21 +134,47 @@ public class DefaultContainerExecutor ex
       new Path(containerWorkDir, ContainerLaunch.FINAL_CONTAINER_TOKENS_FILE);
     lfs.util().copy(nmPrivateTokensPath, tokenDst);
 
+    // Create new local launch wrapper script
+    Path wrapperScriptDst = new Path(containerWorkDir, WRAPPER_LAUNCH_SCRIPT);
+    DataOutputStream wrapperScriptOutStream =
+        lfs.create(wrapperScriptDst,
+            EnumSet.of(CREATE, OVERWRITE));
+
+    Path pidFile = getPidFilePath(containerId);
+    if (pidFile != null) {
+      writeLocalWrapperScript(wrapperScriptOutStream, launchDst.toUri()
+          .getPath().toString(), pidFile.toString());
+    } else {
+      LOG.info("Container " + containerIdStr
+          + " was marked as inactive. Returning terminated error");
+      return ExitCode.TERMINATED.getExitCode();
+    }
+
     // create log dir under app
     // fork script
     ShellCommandExecutor shExec = null;
     try {
       lfs.setPermission(launchDst,
           ContainerExecutor.TASK_LAUNCH_SCRIPT_PERMISSION);
-      String[] command = 
-          new String[] { "bash", "-c", launchDst.toUri().getPath().toString() };
+      lfs.setPermission(wrapperScriptDst,
+          ContainerExecutor.TASK_LAUNCH_SCRIPT_PERMISSION);
+
+      // Setup command to run
+      String[] command = {"bash", "-c",
+          wrapperScriptDst.toUri().getPath().toString()};
       LOG.info("launchContainer: " + Arrays.toString(command));
       shExec = new ShellCommandExecutor(
           command,
-          new File(containerWorkDir.toUri().getPath()), 
+          new File(containerWorkDir.toUri().getPath()),
           container.getLaunchContext().getEnvironment());      // sanitized env
-      launchCommandObjs.put(containerId, shExec);
-      shExec.execute();
+      if (isContainerActive(containerId)) {
+        shExec.execute();
+      }
+      else {
+        LOG.info("Container " + containerIdStr +
+            " was marked as inactive. Returning terminated error");
+        return ExitCode.TERMINATED.getExitCode();
+      }
     } catch (IOException e) {
       if (null == shExec) {
         return -1;
@@ -151,17 +187,44 @@ public class DefaultContainerExecutor ex
           message));
       return exitCode;
     } finally {
-      launchCommandObjs.remove(containerId);
+      ; //
     }
     return 0;
   }
 
+  private void writeLocalWrapperScript(DataOutputStream out,
+      String launchScriptDst, String pidFilePath) throws IOException {
+    // We need to do a move as writing to a file is not atomic
+    // Process reading a file being written to may get garbled data
+    // hence write pid to tmp file first followed by a mv
+    StringBuilder sb = new StringBuilder("#!/bin/bash\n\n");
+    sb.append("echo $$ > " + pidFilePath + ".tmp\n");
+    sb.append("/bin/mv -f " + pidFilePath + ".tmp " + pidFilePath + "\n");
+    sb.append(ContainerExecutor.isSetsidAvailable? "exec setsid" : "exec");
+    sb.append(" /bin/bash ");
+    sb.append("-c ");
+    sb.append("\"");
+    sb.append(launchScriptDst);
+    sb.append("\"\n");
+    PrintStream pout = null;
+    try {
+      pout = new PrintStream(out);
+      pout.append(sb);
+    } finally {
+      if (out != null) {
+        out.close();
+      }
+    }
+  }
+
   @Override
   public boolean signalContainer(String user, String pid, Signal signal)
       throws IOException {
     final String sigpid = ContainerExecutor.isSetsidAvailable
         ? "-" + pid
         : pid;
+    LOG.debug("Sending signal " + signal.getValue() + " to pid " + sigpid
+        + " as user " + user);
     try {
       sendSignal(sigpid, Signal.NULL);
     } catch (ExitCodeException e) {
@@ -189,8 +252,8 @@ public class DefaultContainerExecutor ex
    */
   protected void sendSignal(String pid, Signal signal) throws IOException {
     ShellCommandExecutor shexec = null;
-      String[] arg = { "kill", "-" + signal.getValue(), pid };
-      shexec = new ShellCommandExecutor(arg);
+    String[] arg = { "kill", "-" + signal.getValue(), pid };
+    shexec = new ShellCommandExecutor(arg);
     shexec.execute();
   }
 

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -155,36 +155,45 @@ public class LinuxContainerExecutor exte
 
     ContainerId containerId = container.getContainerID();
     String containerIdStr = ConverterUtils.toString(containerId);
-    List<String> command = new ArrayList<String>(
-      Arrays.asList(containerExecutorExe, 
-                    user, 
-                    Integer.toString(Commands.LAUNCH_CONTAINER.getValue()),
-                    appId,
-                    containerIdStr,
-                    containerWorkDir.toString(),
-                    nmPrivateCotainerScriptPath.toUri().getPath().toString(),
-                    nmPrivateTokensPath.toUri().getPath().toString()));
-    String[] commandArray = command.toArray(new String[command.size()]);
-    ShellCommandExecutor shExec = 
-        new ShellCommandExecutor(
-            commandArray,
-            null,                                              // NM's cwd
-            container.getLaunchContext().getEnvironment());    // sanitized env
-    launchCommandObjs.put(containerId, shExec);
-    // DEBUG
-    LOG.info("launchContainer: " + Arrays.toString(commandArray));
+
+    ShellCommandExecutor shExec = null;
+
     try {
-      shExec.execute();
-      if (LOG.isDebugEnabled()) {
-        logOutput(shExec.getOutput());
+      Path pidFilePath = getPidFilePath(containerId);
+      if (pidFilePath != null) {
+        List<String> command = new ArrayList<String>(Arrays.asList(
+            containerExecutorExe, user, Integer
+                .toString(Commands.LAUNCH_CONTAINER.getValue()), appId,
+            containerIdStr, containerWorkDir.toString(),
+            nmPrivateCotainerScriptPath.toUri().getPath().toString(),
+            nmPrivateTokensPath.toUri().getPath().toString(), pidFilePath
+                .toString()));
+        String[] commandArray = command.toArray(new String[command.size()]);
+        shExec = new ShellCommandExecutor(commandArray, null, // NM's cwd
+            container.getLaunchContext().getEnvironment()); // sanitized env
+        // DEBUG
+        LOG.info("launchContainer: " + Arrays.toString(commandArray));
+        shExec.execute();
+        if (LOG.isDebugEnabled()) {
+          logOutput(shExec.getOutput());
+        }
+      } else {
+        LOG.info("Container was marked as inactive. Returning terminated error");
+        return ExitCode.TERMINATED.getExitCode();
       }
     } catch (ExitCodeException e) {
+
+      if (null == shExec) {
+        return -1;
+      }
+
       int exitCode = shExec.getExitCode();
       LOG.warn("Exit code from container is : " + exitCode);
       // 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
       // terminated/killed forcefully. In all other cases, log the
       // container-executor's output
-      if (exitCode != 143 && exitCode != 137) {
+      if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
+          && exitCode != ExitCode.TERMINATED.getExitCode()) {
         LOG.warn("Exception from container-launch : ", e);
         logOutput(shExec.getOutput());
         String diagnostics = "Exception from container-launch: \n"
@@ -197,7 +206,7 @@ public class LinuxContainerExecutor exte
       }
       return exitCode;
     } finally {
-      launchCommandObjs.remove(containerId);
+      ; //
     }
     if (LOG.isDebugEnabled()) {
       LOG.debug("Output from LinuxContainerExecutor's launchContainer follows:");

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java Thu Oct 27 12:06:08 2011
@@ -168,8 +168,6 @@ public class ContainerImpl implements Co
     .addTransition(ContainerState.LOCALIZED, ContainerState.LOCALIZED,
        ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
        UPDATE_DIAGNOSTICS_TRANSITION)
-       // TODO race: Can lead to a CONTAINER_LAUNCHED event at state KILLING, 
-       // and a container which will never be killed by the NM.
     .addTransition(ContainerState.LOCALIZED, ContainerState.KILLING,
         ContainerEventType.KILL_CONTAINER, new KillTransition())
 
@@ -239,6 +237,13 @@ public class ContainerImpl implements Co
             ContainerState.DONE,
             ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
             CONTAINER_DONE_TRANSITION)
+    // Handle a launched container during killing stage is a no-op
+    // as cleanup container is always handled after launch container event
+    // in the container launcher
+    .addTransition(ContainerState.KILLING,
+        ContainerState.KILLING,
+        ContainerEventType.CONTAINER_LAUNCHED,
+        new ContainerTransition())
 
     // From CONTAINER_CLEANEDUP_AFTER_KILL State.
     .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java Thu Oct 27 12:06:08 2011
@@ -31,6 +31,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -44,11 +45,14 @@ import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.ApplicationConstants;
 import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
+import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.Dispatcher;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.DelayedProcessKiller;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
@@ -56,8 +60,10 @@ import org.apache.hadoop.yarn.server.nod
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
-import org.apache.hadoop.yarn.util.ConverterUtils;
+import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
 import org.apache.hadoop.yarn.util.Apps;
+import org.apache.hadoop.yarn.util.ConverterUtils;
+
 public class ContainerLaunch implements Callable<Integer> {
 
   private static final Log LOG = LogFactory.getLog(ContainerLaunch.class);
@@ -65,12 +71,22 @@ public class ContainerLaunch implements 
   public static final String CONTAINER_SCRIPT = "launch_container.sh";
   public static final String FINAL_CONTAINER_TOKENS_FILE = "container_tokens";
 
+  private static final String PID_FILE_NAME_FMT = "%s.pid";
+
   private final Dispatcher dispatcher;
   private final ContainerExecutor exec;
   private final Application app;
   private final Container container;
   private final Configuration conf;
   private final LocalDirAllocator logDirsSelector;
+  
+  private volatile AtomicBoolean shouldLaunchContainer = new AtomicBoolean(false);
+  private volatile AtomicBoolean completed = new AtomicBoolean(false);
+
+  private long sleepDelayBeforeSigKill = 250;
+  private long maxKillWaitTime = 2000;
+
+  private Path pidFilePath = null;
 
   public ContainerLaunch(Configuration configuration, Dispatcher dispatcher,
       ContainerExecutor exec, Application app, Container container) {
@@ -80,6 +96,12 @@ public class ContainerLaunch implements 
     this.container = container;
     this.dispatcher = dispatcher;
     this.logDirsSelector = new LocalDirAllocator(YarnConfiguration.NM_LOG_DIRS);
+    this.sleepDelayBeforeSigKill =
+        conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS,
+            YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS);
+    this.maxKillWaitTime =
+        conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
+            YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS);
   }
 
   @Override
@@ -87,7 +109,8 @@ public class ContainerLaunch implements 
   public Integer call() {
     final ContainerLaunchContext launchContext = container.getLaunchContext();
     final Map<Path,String> localResources = container.getLocalizedResources();
-    String containerIdStr = ConverterUtils.toString(container.getContainerID());
+    ContainerId containerID = container.getContainerID();
+    String containerIdStr = ConverterUtils.toString(containerID);
     final String user = launchContext.getUser();
     final List<String> command = launchContext.getCommands();
     int ret = -1;
@@ -145,6 +168,17 @@ public class ContainerLaunch implements 
               + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr
               + Path.SEPARATOR + containerIdStr,
               LocalDirAllocator.SIZE_UNKNOWN, this.conf, false);
+
+      String pidFileSuffix = String.format(ContainerLaunch.PID_FILE_NAME_FMT,
+          containerIdStr);
+
+      // pid file should be in nm private dir so that it is not 
+      // accessible by users
+      pidFilePath = lDirAllocator.getLocalPathForWrite(
+          ResourceLocalizationService.NM_PRIVATE_DIR + Path.SEPARATOR 
+          + pidFileSuffix,
+          this.conf);
+
       try {
         // /////////// Write out the container-script in the nmPrivate space.
         String[] localDirs =
@@ -189,21 +223,36 @@ public class ContainerLaunch implements 
       // LaunchContainer is a blocking call. We are here almost means the
       // container is launched, so send out the event.
       dispatcher.getEventHandler().handle(new ContainerEvent(
-            container.getContainerID(),
+            containerID,
             ContainerEventType.CONTAINER_LAUNCHED));
 
-      ret =
-          exec.launchContainer(container, nmPrivateContainerScriptPath,
-              nmPrivateTokensPath, user, appIdStr, containerWorkDir);
+      // Check if the container is signalled to be killed.
+      if (!shouldLaunchContainer.compareAndSet(false, true)) {
+        LOG.info("Container " + containerIdStr + " not launched as "
+            + "cleanup already called");
+        ret = ExitCode.TERMINATED.getExitCode();
+      }
+      else {
+        exec.activateContainer(containerID, pidFilePath);
+        ret =
+            exec.launchContainer(container, nmPrivateContainerScriptPath,
+                nmPrivateTokensPath, user, appIdStr, containerWorkDir);
+      }
     } catch (Throwable e) {
       LOG.warn("Failed to launch container", e);
       dispatcher.getEventHandler().handle(new ContainerExitEvent(
             launchContext.getContainerId(),
             ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret));
       return ret;
+    } finally {
+      completed.set(true);
+      exec.deactivateContainer(containerID);
     }
 
-    if (ret == ExitCode.KILLED.getExitCode()) {
+    LOG.debug("Container " + containerIdStr + " completed with exit code "
+        + ret);
+    if (ret == ExitCode.FORCE_KILLED.getExitCode()
+        || ret == ExitCode.TERMINATED.getExitCode()) {
       // If the process was killed, Send container_cleanedup_after_kill and
       // just break out of this method.
       dispatcher.getEventHandler().handle(
@@ -226,6 +275,114 @@ public class ContainerLaunch implements 
             ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
     return 0;
   }
+  
+  /**
+   * Cleanup the container.
+   * Cancels the launch if launch has not started yet or signals
+   * the executor to not execute the process if not already done so.
+   * Also, sends a SIGTERM followed by a SIGKILL to the process if
+   * the process id is available.
+   * @throws IOException
+   */
+  public void cleanupContainer() throws IOException {
+    ContainerId containerId = container.getContainerID();
+    String containerIdStr = ConverterUtils.toString(containerId);
+    LOG.info("Cleaning up container " + containerIdStr);
+
+    // launch flag will be set to true if process already launched
+    boolean alreadyLaunched = !shouldLaunchContainer.compareAndSet(false, true);
+    if (!alreadyLaunched) {
+      LOG.info("Container " + containerIdStr + " not launched."
+          + " No cleanup needed to be done");
+      return;
+    }
+
+    LOG.debug("Marking container " + containerIdStr + " as inactive");
+    // this should ensure that if the container process has not launched 
+    // by this time, it will never be launched
+    exec.deactivateContainer(containerId);
+
+    LOG.debug("Getting pid for container " + containerIdStr + " to kill"
+        + " from pid file " 
+        + (pidFilePath != null ? pidFilePath.toString() : "null"));
+
+    // however the container process may have already started
+    try {
+
+      // get process id from pid file if available
+      // else if shell is still active, get it from the shell
+      String processId = null;
+      if (pidFilePath != null) {
+        processId = getContainerPid(pidFilePath);
+      }
+
+      // kill process
+      if (processId != null) {
+        String user = container.getLaunchContext().getUser();
+        LOG.debug("Sending signal to pid " + processId
+            + " as user " + user
+            + " for container " + containerIdStr);
+        if (sleepDelayBeforeSigKill > 0) {
+          boolean result = exec.signalContainer(user,
+              processId, Signal.TERM);
+          LOG.debug("Sent signal to pid " + processId
+              + " as user " + user
+              + " for container " + containerIdStr
+              + ", result=" + (result? "success" : "failed"));
+          new DelayedProcessKiller(user,
+              processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
+        }
+      }
+    } catch (Exception e) {
+      LOG.warn("Got error when trying to cleanup container " + containerIdStr
+          + ", error=" + e.getMessage());
+    } finally {
+      // cleanup pid file if present
+      if (pidFilePath != null) {
+        FileContext lfs = FileContext.getLocalFSFileContext();
+        lfs.delete(pidFilePath, false);
+      }
+    }
+  }
+
+  /**
+   * Loop through for a time-bounded interval waiting to
+   * read the process id from a file generated by a running process.
+   * @param pidFilePath File from which to read the process id
+   * @return Process ID
+   * @throws Exception
+   */
+  private String getContainerPid(Path pidFilePath) throws Exception {
+    String containerIdStr = 
+        ConverterUtils.toString(container.getContainerID());
+    String processId = null;
+    LOG.debug("Accessing pid for container " + containerIdStr
+        + " from pid file " + pidFilePath);
+    int sleepCounter = 0;
+    final int sleepInterval = 100;
+
+    // loop waiting for pid file to show up 
+    // until either the completed flag is set which means something bad 
+    // happened or our timer expires in which case we admit defeat
+    while (!completed.get()) {
+      processId = ProcessIdFileReader.getProcessId(pidFilePath);
+      if (processId != null) {
+        LOG.debug("Got pid " + processId + " for container "
+            + containerIdStr);
+        break;
+      }
+      else if ((sleepCounter*sleepInterval) > maxKillWaitTime) {
+        LOG.info("Could not get pid for " + containerIdStr
+        		+ ". Waited for " + maxKillWaitTime + " ms.");
+        break;
+      }
+      else {
+        ++sleepCounter;
+        Thread.sleep(sleepInterval);
+      }
+    }
+    return processId;
+  }
 
   private String getContainerPrivateDir(String appIdStr, String containerIdStr) {
     return getAppPrivateDir(appIdStr) + Path.SEPARATOR + containerIdStr
@@ -287,7 +444,7 @@ public class ContainerLaunch implements 
     public String toString() {
       return sb.toString();
     }
-  
+
   }
 
   private static void putEnvIfNotNull(
@@ -374,9 +531,9 @@ public class ContainerLaunch implements 
         sb.symlink(link.getKey(), link.getValue());
       }
     }
+
     ArrayList<String> cmd = new ArrayList<String>(2 * command.size() + 5);
-    cmd.add(ContainerExecutor.isSetsidAvailable ? "exec setsid " : "exec ");
-    cmd.add("/bin/bash ");
+    cmd.add("exec /bin/bash ");
     cmd.add("-c ");
     cmd.add("\"");
     for (String cs : command) {

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java Thu Oct 27 12:06:08 2011
@@ -26,15 +26,17 @@ import java.util.concurrent.ExecutorServ
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.yarn.YarnException;
 import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.Dispatcher;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
-import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -52,6 +54,8 @@ import com.google.common.util.concurrent
 public class ContainersLauncher extends AbstractService
     implements EventHandler<ContainersLauncherEvent> {
 
+  private static final Log LOG = LogFactory.getLog(ContainersLauncher.class);
+
   private final Context context;
   private final ContainerExecutor exec;
   private final Dispatcher dispatcher;
@@ -64,13 +68,14 @@ public class ContainersLauncher extends 
     Collections.synchronizedMap(new HashMap<ContainerId,RunningContainer>());
 
   private static final class RunningContainer {
-    public RunningContainer(String string, Future<Integer> submit) {
-      this.user = string;
+    public RunningContainer(Future<Integer> submit,
+        ContainerLaunch launcher) {
       this.runningcontainer = submit;
+      this.launcher = launcher;
     }
 
-    String user;
     Future<Integer> runningcontainer;
+    ContainerLaunch launcher;
   }
 
 
@@ -104,7 +109,6 @@ public class ContainersLauncher extends 
     // TODO: ContainersLauncher launches containers one by one!!
     Container container = event.getContainer();
     ContainerId containerId = container.getContainerID();
-    String userName = container.getUser();
     switch (event.getType()) {
       case LAUNCH_CONTAINER:
         Application app =
@@ -114,33 +118,26 @@ public class ContainersLauncher extends 
           new ContainerLaunch(getConfig(), dispatcher, exec, app,
               event.getContainer());
         running.put(containerId,
-            new RunningContainer(userName,
-                containerLauncher.submit(launch)));
+            new RunningContainer(containerLauncher.submit(launch), 
+                launch));
         break;
       case CLEANUP_CONTAINER:
         RunningContainer rContainerDatum = running.remove(containerId);
         Future<Integer> rContainer = rContainerDatum.runningcontainer;
-        if (rContainer != null) {
-  
-          if (rContainer.isDone()) {
-            // The future is already done by this time.
-            break;
-          }
-  
-          // Cancel the future so that it won't be launched if it isn't already.
+        if (rContainer != null 
+            && !rContainer.isDone()) {
+          // Cancel the future so that it won't be launched 
+          // if it isn't already.
           rContainer.cancel(false);
-  
-          // Kill the container
-          String processId = exec.getProcessId(containerId);
-          if (processId != null) {
-            try {
-              exec.signalContainer(rContainerDatum.user,
-                  processId, Signal.KILL);
-            } catch (IOException e) {
-              // TODO Auto-generated catch block
-              e.printStackTrace();
-            }
-          }
+        }
+
+        // Cleanup a container whether it is running/killed/completed, so that
+        // no sub-processes are alive.
+        try {
+          rContainerDatum.launcher.cleanupContainer();
+        } catch (IOException e) {
+          LOG.warn("Got exception while cleaning container " + containerId
+              + ". Ignoring.");
         }
         break;
     }

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -185,7 +185,9 @@ public class TestLinuxContainerExecutor 
     Path scriptPath = new Path(script);
     Path tokensPath = new Path("/dev/null");
     Path workDir = new Path(workSpace.getAbsolutePath());
-    
+    Path pidFile = new Path(workDir, "pid.txt");
+
+    exec.activateContainer(cId, pidFile);
     return exec.launchContainer(container, scriptPath, tokensPath,
         appSubmitter, appId, workDir);
   }

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java Thu Oct 27 12:06:08 2011
@@ -60,7 +60,8 @@ public class TestLinuxContainerExecutorW
   
   private List<String> readMockParams() throws IOException {
     LinkedList<String> ret = new LinkedList<String>();
-    LineNumberReader reader = new LineNumberReader(new FileReader(mockParamFile));
+    LineNumberReader reader = new LineNumberReader(new FileReader(
+        mockParamFile));
     String line;
     while((line = reader.readLine()) != null) {
       ret.add(line);
@@ -70,7 +71,7 @@ public class TestLinuxContainerExecutorW
   }
   
   @Before
-  public void setup() throws IOException {
+  public void setup() {
     File f = new File("./src/test/resources/mock-container-executor");
     if(!f.canExecute()) {
       f.setExecutable(true);
@@ -83,7 +84,7 @@ public class TestLinuxContainerExecutorW
   }
 
   @After
-  public void tearDown() throws IOException {
+  public void tearDown() {
     deleteMockParamFile();
   }
   
@@ -109,11 +110,14 @@ public class TestLinuxContainerExecutorW
     Path scriptPath = new Path("file:///bin/echo");
     Path tokensPath = new Path("file:///dev/null");
     Path workDir = new Path("/tmp");
-    int ret = mockExec.launchContainer(container, scriptPath, tokensPath, 
+    Path pidFile = new Path(workDir, "pid.txt");
+
+    mockExec.activateContainer(cId, pidFile);
+    int ret = mockExec.launchContainer(container, scriptPath, tokensPath,
         appSubmitter, appId, workDir);
     assertEquals(0, ret);
-    assertEquals(Arrays.asList(appSubmitter, cmd, appId, containerId, 
-        workDir.toString(), "/bin/echo", "/dev/null"),
+    assertEquals(Arrays.asList(appSubmitter, cmd, appId, containerId,
+        workDir.toString(), "/bin/echo", "/dev/null", pidFile),
         readMockParams());
   }
 
@@ -141,4 +145,4 @@ public class TestLinuxContainerExecutorW
     assertEquals(Arrays.asList(appSubmitter, cmd, "/tmp/testdir"),
         readMockParams());
   }
-}
\ No newline at end of file
+}

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java Thu Oct 27 12:06:08 2011
@@ -280,7 +280,7 @@ public class TestContainerManager extend
     gcsRequest.setContainerId(cId);
     ContainerStatus containerStatus = 
         containerManager.getContainerStatus(gcsRequest).getStatus();
-    Assert.assertEquals(ExitCode.KILLED.getExitCode(),
+    Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
         containerStatus.getExitStatus());
 
     // Assert that the process is not alive anymore

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java Thu Oct 27 12:06:08 2011
@@ -168,7 +168,7 @@ public class TestContainer {
       wc.localizeResources();
       wc.launchContainer();
       reset(wc.localizerBus);
-      wc.containerFailed(ExitCode.KILLED.getExitCode());
+      wc.containerFailed(ExitCode.FORCE_KILLED.getExitCode());
       assertEquals(ContainerState.EXITED_WITH_FAILURE, 
           wc.c.getContainerState());
       verifyCleanupCall(wc);
@@ -268,6 +268,26 @@ public class TestContainer {
     }
   }
 
+  @Test
+  public void testLaunchAfterKillRequest() throws Exception {
+    WrappedContainer wc = null;
+    try {
+      wc = new WrappedContainer(14, 314159265358979L, 4344, "yak");
+      wc.initContainer();
+      wc.localizeResources();
+      wc.killContainer();
+      assertEquals(ContainerState.KILLING, wc.c.getContainerState());
+      wc.launchContainer();
+      assertEquals(ContainerState.KILLING, wc.c.getContainerState());
+      wc.containerKilledOnRequest();
+      verifyCleanupCall(wc);
+    } finally {
+      if (wc != null) {
+        wc.finished();
+      }
+    }
+  }
+  
   private void verifyCleanupCall(WrappedContainer wc) throws Exception {
     ResourcesReleasedMatcher matchesReq =
         new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(
@@ -511,7 +531,7 @@ public class TestContainer {
 
     public void containerKilledOnRequest() {
       c.handle(new ContainerExitEvent(cId,
-          ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.KILLED
+          ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.FORCE_KILLED
               .getExitCode()));
       drainDispatcherEvents();
     }

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java Thu Oct 27 12:06:08 2011
@@ -32,7 +32,6 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.regex.Pattern;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
@@ -59,7 +58,6 @@ import org.apache.hadoop.yarn.server.nod
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
 import org.apache.hadoop.yarn.util.ConverterUtils;
 import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin;
-import org.apache.hadoop.yarn.util.ProcfsBasedProcessTree;
 import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
 import org.junit.Before;
 import org.junit.Test;
@@ -76,22 +74,21 @@ public class TestContainerLaunch extends
     conf.setClass(
         YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR,
         LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class);
+    conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 1000);
     super.setup();
   }
 
   @Test
   public void testSpecialCharSymlinks() throws IOException  {
 
-    String rootDir = new File(System.getProperty(
-        "test.build.data", "/tmp")).getAbsolutePath();
     File shellFile = null;
     File tempFile = null;
     String badSymlink = "foo@zz%_#*&!-+= bar()";
     File symLinkFile = null;
 
     try {
-      shellFile = new File(rootDir, "hello.sh");
-      tempFile = new File(rootDir, "temp.sh");
+      shellFile = new File(tmpDir, "hello.sh");
+      tempFile = new File(tmpDir, "temp.sh");
       String timeoutCommand = "echo \"hello\"";
       PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));    
       shellFile.setExecutable(true);
@@ -113,15 +110,14 @@ public class TestContainerLaunch extends
       fos.close();
       tempFile.setExecutable(true);
 
-      File rootDirFile = new File(rootDir);
       Shell.ShellCommandExecutor shexc 
-      = new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, rootDirFile);
+      = new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, tmpDir);
 
       shexc.execute();
       assertEquals(shexc.getExitCode(), 0);
       assert(shexc.getOutput().contains("hello"));
 
-      symLinkFile = new File(rootDir, badSymlink);      
+      symLinkFile = new File(tmpDir, badSymlink);      
     }
     finally {
       // cleanup
@@ -141,6 +137,7 @@ public class TestContainerLaunch extends
   }
   
   // this is a dirty hack - but should be ok for a unittest.
+  @SuppressWarnings({ "rawtypes", "unchecked" })
   public static void setNewEnvironmentHack(Map<String, String> newenv) throws Exception {
     Class[] classes = Collections.class.getDeclaredClasses();
     Map<String, String> env = System.getenv();
@@ -162,7 +159,6 @@ public class TestContainerLaunch extends
    */
   @Test
   public void testContainerEnvVariables() throws Exception {
-    int exitCode = 0;
     containerManager.start();
 
     Map<String, String> envWithDummy = new HashMap<String, String>();
@@ -217,7 +213,7 @@ public class TestContainerLaunch extends
         new HashMap<String, LocalResource>();
     localResources.put(destinationFile, rsrc_alpha);
     containerLaunchContext.setLocalResources(localResources);
-    
+
     // set up the rest of the container
     containerLaunchContext.setUser(containerLaunchContext.getUser());
     List<String> commands = new ArrayList<String>();
@@ -226,11 +222,11 @@ public class TestContainerLaunch extends
     containerLaunchContext.setCommands(commands);
     containerLaunchContext.setResource(recordFactory
         .newRecordInstance(Resource.class));
-    containerLaunchContext.getResource().setMemory(100 * 1024 * 1024);
+    containerLaunchContext.getResource().setMemory(1024);
     StartContainerRequest startRequest = recordFactory.newRecordInstance(StartContainerRequest.class);
     startRequest.setContainerLaunchContext(containerLaunchContext);
     containerManager.startContainer(startRequest);
- 
+
     int timeoutSecs = 0;
     while (!processStartFile.exists() && timeoutSecs++ < 20) {
       Thread.sleep(1000);
@@ -238,7 +234,7 @@ public class TestContainerLaunch extends
     }
     Assert.assertTrue("ProcessStartFile doesn't exist!",
         processStartFile.exists());
-    
+
     // Now verify the contents of the file
     BufferedReader reader =
         new BufferedReader(new FileReader(processStartFile));
@@ -265,13 +261,13 @@ public class TestContainerLaunch extends
 
     BaseContainerManagerTest.waitForContainerState(containerManager, cId,
         ContainerState.COMPLETE);
-    
+
     GetContainerStatusRequest gcsRequest = 
         recordFactory.newRecordInstance(GetContainerStatusRequest.class);
     gcsRequest.setContainerId(cId);
     ContainerStatus containerStatus = 
         containerManager.getContainerStatus(gcsRequest).getStatus();
-    Assert.assertEquals(ExitCode.KILLED.getExitCode(),
+    Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
         containerStatus.getExitStatus());
 
     // Assert that the process is not alive anymore
@@ -279,4 +275,119 @@ public class TestContainerLaunch extends
         exec.signalContainer(user,
             pid, Signal.NULL));
   }
+
+  @Test
+  public void testDelayedKill() throws Exception {
+    containerManager.start();
+
+    File processStartFile =
+        new File(tmpDir, "pid.txt").getAbsoluteFile();
+
+    // setup a script that can handle sigterm gracefully
+    File scriptFile = new File(tmpDir, "testscript.sh");
+    PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
+    writer.println("#!/bin/bash\n\n");
+    writer.println("echo \"Running testscript for delayed kill\"");
+    writer.println("hello=\"Got SIGTERM\"");
+    writer.println("umask 0");
+    writer.println("trap \"echo $hello >> " + processStartFile + "\" SIGTERM");
+    writer.println("echo \"Writing pid to start file\"");
+    writer.println("echo $$ >> " + processStartFile);
+    writer.println("while true; do\nsleep 1s;\ndone");
+    writer.close();
+    scriptFile.setExecutable(true);
+
+    ContainerLaunchContext containerLaunchContext = 
+        recordFactory.newRecordInstance(ContainerLaunchContext.class);
+
+    // ////// Construct the Container-id
+    ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
+    appId.setClusterTimestamp(1);
+    appId.setId(1);
+    ApplicationAttemptId appAttemptId = 
+        recordFactory.newRecordInstance(ApplicationAttemptId.class);
+    appAttemptId.setApplicationId(appId);
+    appAttemptId.setAttemptId(1);
+    ContainerId cId = 
+        recordFactory.newRecordInstance(ContainerId.class);
+    cId.setApplicationAttemptId(appAttemptId);
+    containerLaunchContext.setContainerId(cId);
+
+    containerLaunchContext.setUser(user);
+
+    // upload the script file so that the container can run it
+    URL resource_alpha =
+        ConverterUtils.getYarnUrlFromPath(localFS
+            .makeQualified(new Path(scriptFile.getAbsolutePath())));
+    LocalResource rsrc_alpha =
+        recordFactory.newRecordInstance(LocalResource.class);
+    rsrc_alpha.setResource(resource_alpha);
+    rsrc_alpha.setSize(-1);
+    rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
+    rsrc_alpha.setType(LocalResourceType.FILE);
+    rsrc_alpha.setTimestamp(scriptFile.lastModified());
+    String destinationFile = "dest_file.sh";
+    Map<String, LocalResource> localResources = 
+        new HashMap<String, LocalResource>();
+    localResources.put(destinationFile, rsrc_alpha);
+    containerLaunchContext.setLocalResources(localResources);
+
+    // set up the rest of the container
+    containerLaunchContext.setUser(containerLaunchContext.getUser());
+    List<String> commands = new ArrayList<String>();
+    commands.add(scriptFile.getAbsolutePath());
+    containerLaunchContext.setCommands(commands);
+    containerLaunchContext.setResource(recordFactory
+        .newRecordInstance(Resource.class));
+    containerLaunchContext.getResource().setMemory(1024);
+    StartContainerRequest startRequest = recordFactory.newRecordInstance(StartContainerRequest.class);
+    startRequest.setContainerLaunchContext(containerLaunchContext);
+    containerManager.startContainer(startRequest);
+
+    int timeoutSecs = 0;
+    while (!processStartFile.exists() && timeoutSecs++ < 20) {
+      Thread.sleep(1000);
+      LOG.info("Waiting for process start-file to be created");
+    }
+    Assert.assertTrue("ProcessStartFile doesn't exist!",
+        processStartFile.exists());
+
+    // Now test the stop functionality.
+    StopContainerRequest stopRequest = recordFactory.newRecordInstance(StopContainerRequest.class);
+    stopRequest.setContainerId(cId);
+    containerManager.stopContainer(stopRequest);
+
+    BaseContainerManagerTest.waitForContainerState(containerManager, cId,
+        ContainerState.COMPLETE);
+
+    // container stop sends a sigterm followed by a sigkill
+    GetContainerStatusRequest gcsRequest = 
+        recordFactory.newRecordInstance(GetContainerStatusRequest.class);
+    gcsRequest.setContainerId(cId);
+    ContainerStatus containerStatus = 
+        containerManager.getContainerStatus(gcsRequest).getStatus();
+    Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
+        containerStatus.getExitStatus());
+
+    // Now verify the contents of the file
+    // Script generates a message when it receives a sigterm
+    // so we look for that
+    BufferedReader reader =
+        new BufferedReader(new FileReader(processStartFile));
+
+    boolean foundSigTermMessage = false;
+    while (true) {
+      String line = reader.readLine();
+      if (line == null) {
+        break;
+      }
+      if (line.contains("SIGTERM")) {
+        foundSigTermMessage = true;
+        break;
+      }
+    }
+    Assert.assertTrue("Did not find sigterm message", foundSigTermMessage);
+    reader.close();
+  }
+
 }

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java Thu Oct 27 12:06:08 2011
@@ -262,7 +262,7 @@ public class TestContainersMonitor exten
     gcsRequest.setContainerId(cId);
     ContainerStatus containerStatus =
         containerManager.getContainerStatus(gcsRequest).getStatus();
-    Assert.assertEquals(ExitCode.KILLED.getExitCode(),
+    Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
         containerStatus.getExitStatus());
     String expectedMsgPattern =
         "Container \\[pid=" + pid + ",containerID=" + cId

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor Thu Oct 27 12:06:08 2011
@@ -5,6 +5,6 @@ do
 done > params.txt
 if [[ "$2" == "1" ]];
 then
-  cd $5;
-  exec $6;
+  cd $6;
+  exec $7;
 fi;