You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by vi...@apache.org on 2011/10/27 14:06:09 UTC
svn commit: r1189713 - in
/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project: ./
hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/
hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/...
Author: vinodkv
Date: Thu Oct 27 12:06:08 2011
New Revision: 1189713
URL: http://svn.apache.org/viewvc?rev=1189713&view=rev
Log:
MAPREDUCE-3240. Fixed NodeManager to be able to forcefully cleanup its containers (process-trees) irrespective of whether the container succeeded or was killed. Contributed by Hitesh Shah.
Modified:
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt Thu Oct 27 12:06:08 2011
@@ -1743,6 +1743,10 @@ Release 0.23.0 - Unreleased
MAPREDUCE-3279. Fixed TestJobHistoryParsing which assumed user name to be
mapred all the time. (Siddharth Seth via acmurthy)
+ MAPREDUCE-3240. Fixed NodeManager to be able to forcefully cleanup its
+ containers (process-trees) irrespective of whether the container succeeded,
+ or killed. Contributed by Hitesh Shah.
+
Release 0.22.0 - Unreleased
INCOMPATIBLE CHANGES
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java Thu Oct 27 12:06:08 2011
@@ -166,8 +166,6 @@ public abstract class RMContainerRequest
for (ResourceRequest req : reqMap.values()) {
if (!ask.remove(req)) {
foundAll = false;
- }
- else {
// if ask already sent to RM, we can try and overwrite it if possible.
// send a new ask to RM with numContainers
// specified for the blacklisted host to be 0.
@@ -181,7 +179,7 @@ public abstract class RMContainerRequest
// we can remove this request
if (foundAll) {
remoteRequests.remove(hostName);
- }
+ }
}
// TODO handling of rack blacklisting
// Removing from rack should be dependent on no. of failures within the rack
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java Thu Oct 27 12:06:08 2011
@@ -411,6 +411,20 @@ public class YarnConfiguration extends C
YARN_SECURITY_SERVICE_AUTHORIZATION_RESOURCE_LOCALIZER =
"security.resourcelocalizer.protocol.acl";
+ /** No. of milliseconds to wait between sending a SIGTERM and SIGKILL
+ * to a running container */
+ public static final String NM_SLEEP_DELAY_BEFORE_SIGKILL_MS =
+ NM_PREFIX + "sleep-delay-before-sigkill.ms";
+ public static final long DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS =
+ 250;
+
+ /** Max time to wait for a process to come up when trying to cleanup
+ * container resources */
+ public static final String NM_PROCESS_KILL_WAIT_MS =
+ NM_PREFIX + "process-kill-wait.ms";
+ public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
+ 2000;
+
public YarnConfiguration() {
super();
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml Thu Oct 27 12:06:08 2011
@@ -366,6 +366,18 @@
<!-- <value>mapreduce.shuffle</value> -->
</property>
+ <property>
+ <description>No. of ms to wait between sending a SIGTERM and SIGKILL to a container</description>
+ <name>yarn.nodemanager.sleep-delay-before-sigkill.ms</name>
+ <value>250</value>
+ </property>
+
+ <property>
+ <description>Max time to wait for a process to come up when trying to cleanup a container</description>
+ <name>yarn.nodemanager.process-kill-wait.ms</name>
+ <value>2000</value>
+ </property>
+
<!--Map Reduce configuration-->
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.c Thu Oct 27 12:06:08 2011
@@ -45,6 +45,9 @@ FILE* ERRORFILE = NULL;
static uid_t nm_uid = -1;
static gid_t nm_gid = -1;
+char *concatenate(char *concat_pattern, char *return_path_name,
+ int numArgs, ...);
+
void set_nm_uid(uid_t user, gid_t group) {
nm_uid = user;
nm_gid = group;
@@ -148,6 +151,60 @@ static int change_effective_user(uid_t u
}
/**
+ * Write the pid of the current process into the pid file.
+ * pid_file: Path to pid file where pid needs to be written to
+ */
+static int write_pid_to_file_as_nm(const char* pid_file, pid_t pid) {
+ uid_t user = geteuid();
+ gid_t group = getegid();
+ if (change_effective_user(nm_uid, nm_gid) != 0) {
+ return -1;
+ }
+
+ char *temp_pid_file = concatenate("%s.tmp", "pid_file_path", 1, pid_file);
+
+ // create with 700
+ int pid_fd = open(temp_pid_file, O_WRONLY|O_CREAT|O_EXCL, S_IRWXU);
+ if (pid_fd == -1) {
+ fprintf(LOGFILE, "Can't open file %s as node manager - %s\n", temp_pid_file,
+ strerror(errno));
+ free(temp_pid_file);
+ return -1;
+ }
+
+ // write pid to temp file
+ char pid_buf[21];
+ snprintf(pid_buf, 21, "%d", pid);
+ ssize_t written = write(pid_fd, pid_buf, strlen(pid_buf));
+ close(pid_fd);
+ if (written == -1) {
+ fprintf(LOGFILE, "Failed to write pid to file %s as node manager - %s\n",
+ temp_pid_file, strerror(errno));
+ free(temp_pid_file);
+ return -1;
+ }
+
+ // rename temp file to actual pid file
+ // use rename as atomic
+ if (rename(temp_pid_file, pid_file)) {
+ fprintf(LOGFILE, "Can't move pid file from %s to %s as node manager - %s\n",
+ temp_pid_file, pid_file, strerror(errno));
+ unlink(temp_pid_file);
+ free(temp_pid_file);
+ return -1;
+ }
+
+ // Revert back to the calling user.
+ if (change_effective_user(user, group)) {
+ free(temp_pid_file);
+ return -1;
+ }
+
+ free(temp_pid_file);
+ return 0;
+}
+
+/**
* Change the real and effective user and group to abandon the super user
* priviledges.
*/
@@ -749,7 +806,8 @@ int initialize_app(const char *user, con
int launch_container_as_user(const char *user, const char *app_id,
const char *container_id, const char *work_dir,
- const char *script_name, const char *cred_file) {
+ const char *script_name, const char *cred_file,
+ const char* pid_file) {
int exit_code = -1;
char *script_file_dest = NULL;
char *cred_file_dest = NULL;
@@ -776,6 +834,20 @@ int launch_container_as_user(const char
goto cleanup;
}
+ // setsid
+ pid_t pid = setsid();
+ if (pid == -1) {
+ exit_code = SETSID_OPER_FAILED;
+ goto cleanup;
+ }
+
+ // write pid to pidfile
+ if (pid_file == NULL
+ || write_pid_to_file_as_nm(pid_file, pid) != 0) {
+ exit_code = WRITE_PIDFILE_FAILED;
+ goto cleanup;
+ }
+
// give up root privs
if (change_user(user_detail->pw_uid, user_detail->pw_gid) != 0) {
exit_code = SETUID_OPER_FAILED;
@@ -1031,3 +1103,5 @@ int delete_as_user(const char *user,
}
return ret;
}
+
+
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/container-executor.h Thu Oct 27 12:06:08 2011
@@ -51,7 +51,9 @@ enum errorcodes {
UNABLE_TO_BUILD_PATH, //21
INVALID_CONTAINER_EXEC_PERMISSIONS, //22
// PREPARE_JOB_LOGS_FAILED (NOT USED) 23
- INVALID_CONFIG_FILE = 24
+ INVALID_CONFIG_FILE = 24,
+ SETSID_OPER_FAILED = 25,
+ WRITE_PIDFILE_FAILED = 26
};
#define NM_GROUP_KEY "yarn.nodemanager.linux-container-executor.group"
@@ -106,11 +108,13 @@ int initialize_app(const char *user, con
* @param script_name the name of the script to be run to launch the container.
* @param cred_file the credentials file that needs to be compied to the
* working directory.
+ * @param pid_file file where pid of process should be written to
* @return -1 or errorcode enum value on error (should never return on success).
*/
int launch_container_as_user(const char * user, const char *app_id,
const char *container_id, const char *work_dir,
- const char *script_name, const char *cred_file);
+ const char *script_name, const char *cred_file,
+ const char *pid_file);
/**
* Function used to signal a container launched by the user.
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/impl/main.c Thu Oct 27 12:06:08 2011
@@ -66,6 +66,7 @@ int main(int argc, char **argv) {
const char * cred_file = NULL;
const char * script_file = NULL;
const char * current_dir = NULL;
+ const char * pid_file = NULL;
int exit_code = 0;
@@ -141,7 +142,7 @@ int main(int argc, char **argv) {
argv + optind);
break;
case LAUNCH_CONTAINER:
- if (argc < 8) {
+ if (argc < 9) {
fprintf(ERRORFILE, "Too few arguments (%d vs 8) for launch container\n",
argc);
fflush(ERRORFILE);
@@ -152,8 +153,9 @@ int main(int argc, char **argv) {
current_dir = argv[optind++];
script_file = argv[optind++];
cred_file = argv[optind++];
+ pid_file = argv[optind++];
exit_code = launch_container_as_user(user_detail->pw_name, app_id, container_id,
- current_dir, script_file, cred_file);
+ current_dir, script_file, cred_file, pid_file);
break;
case SIGNAL_CONTAINER:
if (argc < 5) {
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/c/container-executor/test/test-container-executor.c Thu Oct 27 12:06:08 2011
@@ -590,6 +590,7 @@ void test_run_container() {
fflush(stderr);
char* container_dir = get_container_work_directory(TEST_ROOT "/local-1",
username, "app_4", "container_1");
+ const char * pid_file = TEST_ROOT "/pid.txt";
pid_t child = fork();
if (child == -1) {
printf("FAIL: failed to fork process for init_app - %s\n",
@@ -597,7 +598,7 @@ void test_run_container() {
exit(1);
} else if (child == 0) {
if (launch_container_as_user(username, "app_4", "container_1",
- container_dir, script_name, TEST_ROOT "creds.txt") != 0) {
+ container_dir, script_name, TEST_ROOT "/creds.txt", pid_file) != 0) {
printf("FAIL: failed in child\n");
exit(42);
}
@@ -631,6 +632,32 @@ void test_run_container() {
exit(1);
}
free(container_dir);
+
+ if(access(pid_file, R_OK) != 0) {
+ printf("FAIL: failed to create pid file %s\n", pid_file);
+ exit(1);
+ }
+ int pidfd = open(pid_file, O_RDONLY);
+ if (pidfd == -1) {
+ printf("FAIL: failed to open pid file %s - %s\n", pid_file, strerror(errno));
+ exit(1);
+ }
+
+ char pidBuf[100];
+ ssize_t bytes = read(pidfd, pidBuf, 100);
+ if (bytes == -1) {
+ printf("FAIL: failed to read from pid file %s - %s\n", pid_file, strerror(errno));
+ exit(1);
+ }
+
+ pid_t mypid = child;
+ char myPidBuf[33];
+ snprintf(myPidBuf, 33, "%d", mypid);
+ if (strncmp(pidBuf, myPidBuf, strlen(myPidBuf)) != 0) {
+ printf("FAIL: failed to find matching pid in pid file\n");
+ printf("FAIL: Expected pid %d : Got %.*s", mypid, (int)bytes, pidBuf);
+ exit(1);
+ }
}
int main(int argc, char **argv) {
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -19,12 +19,13 @@
package org.apache.hadoop.yarn.server.nodemanager;
import java.io.IOException;
-import java.lang.reflect.Field;
-
import java.net.InetSocketAddress;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -35,6 +36,7 @@ import org.apache.hadoop.fs.permission.F
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
public abstract class ContainerExecutor implements Configurable {
@@ -43,8 +45,12 @@ public abstract class ContainerExecutor
FsPermission.createImmutable((short) 0700);
private Configuration conf;
- protected ConcurrentMap<ContainerId, ShellCommandExecutor> launchCommandObjs =
- new ConcurrentHashMap<ContainerId, ShellCommandExecutor>();
+ private ConcurrentMap<ContainerId, Path> pidFiles =
+ new ConcurrentHashMap<ContainerId, Path>();
+
+ private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
+ private final ReadLock readLock = lock.readLock();
+ private final WriteLock writeLock = lock.writeLock();
@Override
public void setConf(Configuration conf) {
@@ -102,7 +108,8 @@ public abstract class ContainerExecutor
throws IOException, InterruptedException;
public enum ExitCode {
- KILLED(137);
+ FORCE_KILLED(137),
+ TERMINATED(143);
private final int code;
private ExitCode(int exitCode) {
@@ -150,6 +157,66 @@ public abstract class ContainerExecutor
}
/**
+ * Get the pidFile of the container.
+ * @param containerId
+ * @return the path of the pid-file for the given containerId.
+ */
+ protected Path getPidFilePath(ContainerId containerId) {
+ try {
+ readLock.lock();
+ return (this.pidFiles.get(containerId));
+ } finally {
+ readLock.unlock();
+ }
+ }
+
+ /**
+ * Is the container still active?
+ * @param containerId
+ * @return true if the container is active else false.
+ */
+ protected boolean isContainerActive(ContainerId containerId) {
+ try {
+ readLock.lock();
+ return (this.pidFiles.containsKey(containerId));
+ } finally {
+ readLock.unlock();
+ }
+ }
+
+ /**
+ * Mark the container as active
+ *
+ * @param containerId
+ * the ContainerId
+ * @param pidFilePath
+ * Path where the executor should write the pid of the launched
+ * process
+ */
+ public void activateContainer(ContainerId containerId, Path pidFilePath) {
+ try {
+ writeLock.lock();
+ this.pidFiles.put(containerId, pidFilePath);
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ /**
+ * Mark the container as inactive.
+ * Done iff the container is still active. Else treat it as
+ * a no-op
+ */
+ public void deactivateContainer(ContainerId containerId) {
+ try {
+ writeLock.lock();
+ this.pidFiles.remove(containerId);
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ /**
* Get the process-identifier for the container
*
* @param containerID
@@ -158,28 +225,15 @@ public abstract class ContainerExecutor
*/
public String getProcessId(ContainerId containerID) {
String pid = null;
- ShellCommandExecutor shExec = launchCommandObjs.get(containerID);
- if (shExec == null) {
+ Path pidFile = pidFiles.get(containerID);
+ if (pidFile == null) {
// This container isn't even launched yet.
return pid;
}
- Process proc = shExec.getProcess();
- if (proc == null) {
- // This happens if the command is not yet started
- return pid;
- }
try {
- Field pidField = proc.getClass().getDeclaredField("pid");
- pidField.setAccessible(true);
- pid = ((Integer) pidField.get(proc)).toString();
- } catch (SecurityException e) {
- // SecurityManager not expected with yarn. Ignore.
- } catch (NoSuchFieldException e) {
- // Yarn only on UNIX for now. Ignore.
- } catch (IllegalArgumentException e) {
- ;
- } catch (IllegalAccessException e) {
- ;
+ pid = ProcessIdFileReader.getProcessId(pidFile);
+ } catch (IOException e) {
+ LOG.error("Got exception reading pid from pid-file " + pidFile, e);
}
return pid;
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -18,10 +18,16 @@
package org.apache.hadoop.yarn.server.nodemanager;
+import static org.apache.hadoop.fs.CreateFlag.CREATE;
+import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
+
+import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
+import java.io.PrintStream;
import java.net.InetSocketAddress;
import java.util.Arrays;
+import java.util.EnumSet;
import java.util.List;
import org.apache.commons.logging.Log;
@@ -48,6 +54,9 @@ public class DefaultContainerExecutor ex
private final FileContext lfs;
+ private static final String WRAPPER_LAUNCH_SCRIPT =
+ "default_container_executor.sh";
+
public DefaultContainerExecutor() {
try {
this.lfs = FileContext.getLocalFSFileContext();
@@ -100,8 +109,9 @@ public class DefaultContainerExecutor ex
ConverterUtils.toString(
container.getContainerID().getApplicationAttemptId().
getApplicationId());
- String[] sLocalDirs =
- getConf().getStrings(YarnConfiguration.NM_LOCAL_DIRS, YarnConfiguration.DEFAULT_NM_LOCAL_DIRS);
+ String[] sLocalDirs = getConf().getStrings(
+ YarnConfiguration.NM_LOCAL_DIRS,
+ YarnConfiguration.DEFAULT_NM_LOCAL_DIRS);
for (String sLocalDir : sLocalDirs) {
Path usersdir = new Path(sLocalDir, ContainerLocalizer.USERCACHE);
Path userdir = new Path(usersdir, userName);
@@ -124,21 +134,47 @@ public class DefaultContainerExecutor ex
new Path(containerWorkDir, ContainerLaunch.FINAL_CONTAINER_TOKENS_FILE);
lfs.util().copy(nmPrivateTokensPath, tokenDst);
+ // Create new local launch wrapper script
+ Path wrapperScriptDst = new Path(containerWorkDir, WRAPPER_LAUNCH_SCRIPT);
+ DataOutputStream wrapperScriptOutStream =
+ lfs.create(wrapperScriptDst,
+ EnumSet.of(CREATE, OVERWRITE));
+
+ Path pidFile = getPidFilePath(containerId);
+ if (pidFile != null) {
+ writeLocalWrapperScript(wrapperScriptOutStream, launchDst.toUri()
+ .getPath().toString(), pidFile.toString());
+ } else {
+ LOG.info("Container " + containerIdStr
+ + " was marked as inactive. Returning terminated error");
+ return ExitCode.TERMINATED.getExitCode();
+ }
+
// create log dir under app
// fork script
ShellCommandExecutor shExec = null;
try {
lfs.setPermission(launchDst,
ContainerExecutor.TASK_LAUNCH_SCRIPT_PERMISSION);
- String[] command =
- new String[] { "bash", "-c", launchDst.toUri().getPath().toString() };
+ lfs.setPermission(wrapperScriptDst,
+ ContainerExecutor.TASK_LAUNCH_SCRIPT_PERMISSION);
+
+ // Setup command to run
+ String[] command = {"bash", "-c",
+ wrapperScriptDst.toUri().getPath().toString()};
LOG.info("launchContainer: " + Arrays.toString(command));
shExec = new ShellCommandExecutor(
command,
- new File(containerWorkDir.toUri().getPath()),
+ new File(containerWorkDir.toUri().getPath()),
container.getLaunchContext().getEnvironment()); // sanitized env
- launchCommandObjs.put(containerId, shExec);
- shExec.execute();
+ if (isContainerActive(containerId)) {
+ shExec.execute();
+ }
+ else {
+ LOG.info("Container " + containerIdStr +
+ " was marked as inactive. Returning terminated error");
+ return ExitCode.TERMINATED.getExitCode();
+ }
} catch (IOException e) {
if (null == shExec) {
return -1;
@@ -151,17 +187,44 @@ public class DefaultContainerExecutor ex
message));
return exitCode;
} finally {
- launchCommandObjs.remove(containerId);
+ ; //
}
return 0;
}
+ private void writeLocalWrapperScript(DataOutputStream out,
+ String launchScriptDst, String pidFilePath) throws IOException {
+ // We need to do a move as writing to a file is not atomic
+ // Process reading a file being written to may get garbled data
+ // hence write pid to tmp file first followed by a mv
+ StringBuilder sb = new StringBuilder("#!/bin/bash\n\n");
+ sb.append("echo $$ > " + pidFilePath + ".tmp\n");
+ sb.append("/bin/mv -f " + pidFilePath + ".tmp " + pidFilePath + "\n");
+ sb.append(ContainerExecutor.isSetsidAvailable? "exec setsid" : "exec");
+ sb.append(" /bin/bash ");
+ sb.append("-c ");
+ sb.append("\"");
+ sb.append(launchScriptDst);
+ sb.append("\"\n");
+ PrintStream pout = null;
+ try {
+ pout = new PrintStream(out);
+ pout.append(sb);
+ } finally {
+ if (out != null) {
+ out.close();
+ }
+ }
+ }
+
@Override
public boolean signalContainer(String user, String pid, Signal signal)
throws IOException {
final String sigpid = ContainerExecutor.isSetsidAvailable
? "-" + pid
: pid;
+ LOG.debug("Sending signal " + signal.getValue() + " to pid " + sigpid
+ + " as user " + user);
try {
sendSignal(sigpid, Signal.NULL);
} catch (ExitCodeException e) {
@@ -189,8 +252,8 @@ public class DefaultContainerExecutor ex
*/
protected void sendSignal(String pid, Signal signal) throws IOException {
ShellCommandExecutor shexec = null;
- String[] arg = { "kill", "-" + signal.getValue(), pid };
- shexec = new ShellCommandExecutor(arg);
+ String[] arg = { "kill", "-" + signal.getValue(), pid };
+ shexec = new ShellCommandExecutor(arg);
shexec.execute();
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -155,36 +155,45 @@ public class LinuxContainerExecutor exte
ContainerId containerId = container.getContainerID();
String containerIdStr = ConverterUtils.toString(containerId);
- List<String> command = new ArrayList<String>(
- Arrays.asList(containerExecutorExe,
- user,
- Integer.toString(Commands.LAUNCH_CONTAINER.getValue()),
- appId,
- containerIdStr,
- containerWorkDir.toString(),
- nmPrivateCotainerScriptPath.toUri().getPath().toString(),
- nmPrivateTokensPath.toUri().getPath().toString()));
- String[] commandArray = command.toArray(new String[command.size()]);
- ShellCommandExecutor shExec =
- new ShellCommandExecutor(
- commandArray,
- null, // NM's cwd
- container.getLaunchContext().getEnvironment()); // sanitized env
- launchCommandObjs.put(containerId, shExec);
- // DEBUG
- LOG.info("launchContainer: " + Arrays.toString(commandArray));
+
+ ShellCommandExecutor shExec = null;
+
try {
- shExec.execute();
- if (LOG.isDebugEnabled()) {
- logOutput(shExec.getOutput());
+ Path pidFilePath = getPidFilePath(containerId);
+ if (pidFilePath != null) {
+ List<String> command = new ArrayList<String>(Arrays.asList(
+ containerExecutorExe, user, Integer
+ .toString(Commands.LAUNCH_CONTAINER.getValue()), appId,
+ containerIdStr, containerWorkDir.toString(),
+ nmPrivateCotainerScriptPath.toUri().getPath().toString(),
+ nmPrivateTokensPath.toUri().getPath().toString(), pidFilePath
+ .toString()));
+ String[] commandArray = command.toArray(new String[command.size()]);
+ shExec = new ShellCommandExecutor(commandArray, null, // NM's cwd
+ container.getLaunchContext().getEnvironment()); // sanitized env
+ // DEBUG
+ LOG.info("launchContainer: " + Arrays.toString(commandArray));
+ shExec.execute();
+ if (LOG.isDebugEnabled()) {
+ logOutput(shExec.getOutput());
+ }
+ } else {
+ LOG.info("Container was marked as inactive. Returning terminated error");
+ return ExitCode.TERMINATED.getExitCode();
}
} catch (ExitCodeException e) {
+
+ if (null == shExec) {
+ return -1;
+ }
+
int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container is : " + exitCode);
// 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
// terminated/killed forcefully. In all other cases, log the
// container-executor's output
- if (exitCode != 143 && exitCode != 137) {
+ if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
+ && exitCode != ExitCode.TERMINATED.getExitCode()) {
LOG.warn("Exception from container-launch : ", e);
logOutput(shExec.getOutput());
String diagnostics = "Exception from container-launch: \n"
@@ -197,7 +206,7 @@ public class LinuxContainerExecutor exte
}
return exitCode;
} finally {
- launchCommandObjs.remove(containerId);
+ ; //
}
if (LOG.isDebugEnabled()) {
LOG.debug("Output from LinuxContainerExecutor's launchContainer follows:");
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java Thu Oct 27 12:06:08 2011
@@ -168,8 +168,6 @@ public class ContainerImpl implements Co
.addTransition(ContainerState.LOCALIZED, ContainerState.LOCALIZED,
ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
UPDATE_DIAGNOSTICS_TRANSITION)
- // TODO race: Can lead to a CONTAINER_LAUNCHED event at state KILLING,
- // and a container which will never be killed by the NM.
.addTransition(ContainerState.LOCALIZED, ContainerState.KILLING,
ContainerEventType.KILL_CONTAINER, new KillTransition())
@@ -239,6 +237,13 @@ public class ContainerImpl implements Co
ContainerState.DONE,
ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
CONTAINER_DONE_TRANSITION)
+ // Handling a launched container during the killing stage is a no-op,
+ // as container cleanup is always handled after the launch-container event
+ // in the container launcher
+ .addTransition(ContainerState.KILLING,
+ ContainerState.KILLING,
+ ContainerEventType.CONTAINER_LAUNCHED,
+ new ContainerTransition())
// From CONTAINER_CLEANEDUP_AFTER_KILL State.
.addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java Thu Oct 27 12:06:08 2011
@@ -31,6 +31,7 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -44,11 +45,14 @@ import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
+import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.DelayedProcessKiller;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
@@ -56,8 +60,10 @@ import org.apache.hadoop.yarn.server.nod
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
-import org.apache.hadoop.yarn.util.ConverterUtils;
+import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
import org.apache.hadoop.yarn.util.Apps;
+import org.apache.hadoop.yarn.util.ConverterUtils;
+
public class ContainerLaunch implements Callable<Integer> {
private static final Log LOG = LogFactory.getLog(ContainerLaunch.class);
@@ -65,12 +71,22 @@ public class ContainerLaunch implements
public static final String CONTAINER_SCRIPT = "launch_container.sh";
public static final String FINAL_CONTAINER_TOKENS_FILE = "container_tokens";
+ private static final String PID_FILE_NAME_FMT = "%s.pid";
+
private final Dispatcher dispatcher;
private final ContainerExecutor exec;
private final Application app;
private final Container container;
private final Configuration conf;
private final LocalDirAllocator logDirsSelector;
+
+ private volatile AtomicBoolean shouldLaunchContainer = new AtomicBoolean(false);
+ private volatile AtomicBoolean completed = new AtomicBoolean(false);
+
+ private long sleepDelayBeforeSigKill = 250;
+ private long maxKillWaitTime = 2000;
+
+ private Path pidFilePath = null;
public ContainerLaunch(Configuration configuration, Dispatcher dispatcher,
ContainerExecutor exec, Application app, Container container) {
@@ -80,6 +96,12 @@ public class ContainerLaunch implements
this.container = container;
this.dispatcher = dispatcher;
this.logDirsSelector = new LocalDirAllocator(YarnConfiguration.NM_LOG_DIRS);
+ this.sleepDelayBeforeSigKill =
+ conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS,
+ YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS);
+ this.maxKillWaitTime =
+ conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
+ YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS);
}
@Override
@@ -87,7 +109,8 @@ public class ContainerLaunch implements
public Integer call() {
final ContainerLaunchContext launchContext = container.getLaunchContext();
final Map<Path,String> localResources = container.getLocalizedResources();
- String containerIdStr = ConverterUtils.toString(container.getContainerID());
+ ContainerId containerID = container.getContainerID();
+ String containerIdStr = ConverterUtils.toString(containerID);
final String user = launchContext.getUser();
final List<String> command = launchContext.getCommands();
int ret = -1;
@@ -145,6 +168,17 @@ public class ContainerLaunch implements
+ ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr
+ Path.SEPARATOR + containerIdStr,
LocalDirAllocator.SIZE_UNKNOWN, this.conf, false);
+
+ String pidFileSuffix = String.format(ContainerLaunch.PID_FILE_NAME_FMT,
+ containerIdStr);
+
+ // The pid file should be in the NM-private dir so that it is not
+ // accessible to users
+ pidFilePath = lDirAllocator.getLocalPathForWrite(
+ ResourceLocalizationService.NM_PRIVATE_DIR + Path.SEPARATOR
+ + pidFileSuffix,
+ this.conf);
+
try {
// /////////// Write out the container-script in the nmPrivate space.
String[] localDirs =
@@ -189,21 +223,36 @@ public class ContainerLaunch implements
// LaunchContainer is a blocking call. We are here almost means the
// container is launched, so send out the event.
dispatcher.getEventHandler().handle(new ContainerEvent(
- container.getContainerID(),
+ containerID,
ContainerEventType.CONTAINER_LAUNCHED));
- ret =
- exec.launchContainer(container, nmPrivateContainerScriptPath,
- nmPrivateTokensPath, user, appIdStr, containerWorkDir);
+ // Check if the container is signalled to be killed.
+ if (!shouldLaunchContainer.compareAndSet(false, true)) {
+ LOG.info("Container " + containerIdStr + " not launched as "
+ + "cleanup already called");
+ ret = ExitCode.TERMINATED.getExitCode();
+ }
+ else {
+ exec.activateContainer(containerID, pidFilePath);
+ ret =
+ exec.launchContainer(container, nmPrivateContainerScriptPath,
+ nmPrivateTokensPath, user, appIdStr, containerWorkDir);
+ }
} catch (Throwable e) {
LOG.warn("Failed to launch container", e);
dispatcher.getEventHandler().handle(new ContainerExitEvent(
launchContext.getContainerId(),
ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret));
return ret;
+ } finally {
+ completed.set(true);
+ exec.deactivateContainer(containerID);
}
- if (ret == ExitCode.KILLED.getExitCode()) {
+ LOG.debug("Container " + containerIdStr + " completed with exit code "
+ + ret);
+ if (ret == ExitCode.FORCE_KILLED.getExitCode()
+ || ret == ExitCode.TERMINATED.getExitCode()) {
// If the process was killed, Send container_cleanedup_after_kill and
// just break out of this method.
dispatcher.getEventHandler().handle(
@@ -226,6 +275,114 @@ public class ContainerLaunch implements
ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
return 0;
}
+
+ /**
+ * Clean up the container.
+ * Cancels the launch if it has not started yet, or signals
+ * the executor not to execute the process if it has not already done so.
+ * Also sends a SIGTERM followed by a SIGKILL to the process if
+ * the process id is available.
+ * @throws IOException
+ */
+ public void cleanupContainer() throws IOException {
+ ContainerId containerId = container.getContainerID();
+ String containerIdStr = ConverterUtils.toString(containerId);
+ LOG.info("Cleaning up container " + containerIdStr);
+
+ // launch flag will be set to true if process already launched
+ boolean alreadyLaunched = !shouldLaunchContainer.compareAndSet(false, true);
+ if (!alreadyLaunched) {
+ LOG.info("Container " + containerIdStr + " not launched."
+ + " No cleanup needed to be done");
+ return;
+ }
+
+ LOG.debug("Marking container " + containerIdStr + " as inactive");
+ // this should ensure that if the container process has not launched
+ // by this time, it will never be launched
+ exec.deactivateContainer(containerId);
+
+ LOG.debug("Getting pid for container " + containerIdStr + " to kill"
+ + " from pid file "
+ + (pidFilePath != null ? pidFilePath.toString() : "null"));
+
+ // however the container process may have already started
+ try {
+
+ // get process id from pid file if available
+ // else if shell is still active, get it from the shell
+ String processId = null;
+ if (pidFilePath != null) {
+ processId = getContainerPid(pidFilePath);
+ }
+
+ // kill process
+ if (processId != null) {
+ String user = container.getLaunchContext().getUser();
+ LOG.debug("Sending signal to pid " + processId
+ + " as user " + user
+ + " for container " + containerIdStr);
+ if (sleepDelayBeforeSigKill > 0) {
+ boolean result = exec.signalContainer(user,
+ processId, Signal.TERM);
+ LOG.debug("Sent signal to pid " + processId
+ + " as user " + user
+ + " for container " + containerIdStr
+ + ", result=" + (result? "success" : "failed"));
+ new DelayedProcessKiller(user,
+ processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
+ }
+ }
+ } catch (Exception e) {
+ LOG.warn("Got error when trying to cleanup container " + containerIdStr
+ + ", error=" + e.getMessage());
+ } finally {
+ // cleanup pid file if present
+ if (pidFilePath != null) {
+ FileContext lfs = FileContext.getLocalFSFileContext();
+ lfs.delete(pidFilePath, false);
+ }
+ }
+ }
+
+ /**
+ * Loop through for a time-bounded interval waiting to
+ * read the process id from a file generated by a running process.
+ * @param pidFilePath File from which to read the process id
+ * @return Process ID
+ * @throws Exception
+ */
+ private String getContainerPid(Path pidFilePath) throws Exception {
+ String containerIdStr =
+ ConverterUtils.toString(container.getContainerID());
+ String processId = null;
+ LOG.debug("Accessing pid for container " + containerIdStr
+ + " from pid file " + pidFilePath);
+ int sleepCounter = 0;
+ final int sleepInterval = 100;
+
+ // loop waiting for pid file to show up
+ // until either the completed flag is set which means something bad
+ // happened or our timer expires in which case we admit defeat
+ while (!completed.get()) {
+ processId = ProcessIdFileReader.getProcessId(pidFilePath);
+ if (processId != null) {
+ LOG.debug("Got pid " + processId + " for container "
+ + containerIdStr);
+ break;
+ }
+ else if ((sleepCounter*sleepInterval) > maxKillWaitTime) {
+ LOG.info("Could not get pid for " + containerIdStr
+ + ". Waited for " + maxKillWaitTime + " ms.");
+ break;
+ }
+ else {
+ ++sleepCounter;
+ Thread.sleep(sleepInterval);
+ }
+ }
+ return processId;
+ }
private String getContainerPrivateDir(String appIdStr, String containerIdStr) {
return getAppPrivateDir(appIdStr) + Path.SEPARATOR + containerIdStr
@@ -287,7 +444,7 @@ public class ContainerLaunch implements
public String toString() {
return sb.toString();
}
-
+
}
private static void putEnvIfNotNull(
@@ -374,9 +531,9 @@ public class ContainerLaunch implements
sb.symlink(link.getKey(), link.getValue());
}
}
+
ArrayList<String> cmd = new ArrayList<String>(2 * command.size() + 5);
- cmd.add(ContainerExecutor.isSetsidAvailable ? "exec setsid " : "exec ");
- cmd.add("/bin/bash ");
+ cmd.add("exec /bin/bash ");
cmd.add("-c ");
cmd.add("\"");
for (String cs : command) {
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java Thu Oct 27 12:06:08 2011
@@ -26,15 +26,17 @@ import java.util.concurrent.ExecutorServ
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
-import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -52,6 +54,8 @@ import com.google.common.util.concurrent
public class ContainersLauncher extends AbstractService
implements EventHandler<ContainersLauncherEvent> {
+ private static final Log LOG = LogFactory.getLog(ContainersLauncher.class);
+
private final Context context;
private final ContainerExecutor exec;
private final Dispatcher dispatcher;
@@ -64,13 +68,14 @@ public class ContainersLauncher extends
Collections.synchronizedMap(new HashMap<ContainerId,RunningContainer>());
private static final class RunningContainer {
- public RunningContainer(String string, Future<Integer> submit) {
- this.user = string;
+ public RunningContainer(Future<Integer> submit,
+ ContainerLaunch launcher) {
this.runningcontainer = submit;
+ this.launcher = launcher;
}
- String user;
Future<Integer> runningcontainer;
+ ContainerLaunch launcher;
}
@@ -104,7 +109,6 @@ public class ContainersLauncher extends
// TODO: ContainersLauncher launches containers one by one!!
Container container = event.getContainer();
ContainerId containerId = container.getContainerID();
- String userName = container.getUser();
switch (event.getType()) {
case LAUNCH_CONTAINER:
Application app =
@@ -114,33 +118,26 @@ public class ContainersLauncher extends
new ContainerLaunch(getConfig(), dispatcher, exec, app,
event.getContainer());
running.put(containerId,
- new RunningContainer(userName,
- containerLauncher.submit(launch)));
+ new RunningContainer(containerLauncher.submit(launch),
+ launch));
break;
case CLEANUP_CONTAINER:
RunningContainer rContainerDatum = running.remove(containerId);
Future<Integer> rContainer = rContainerDatum.runningcontainer;
- if (rContainer != null) {
-
- if (rContainer.isDone()) {
- // The future is already done by this time.
- break;
- }
-
- // Cancel the future so that it won't be launched if it isn't already.
+ if (rContainer != null
+ && !rContainer.isDone()) {
+ // Cancel the future so that it won't be launched
+ // if it isn't already.
rContainer.cancel(false);
-
- // Kill the container
- String processId = exec.getProcessId(containerId);
- if (processId != null) {
- try {
- exec.signalContainer(rContainerDatum.user,
- processId, Signal.KILL);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
+ }
+
+ // Clean up a container whether it is running, killed, or completed, so
+ // that no sub-processes are left alive.
+ try {
+ rContainerDatum.launcher.cleanupContainer();
+ } catch (IOException e) {
+ LOG.warn("Got exception while cleaning container " + containerId
+ + ". Ignoring.");
}
break;
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java Thu Oct 27 12:06:08 2011
@@ -185,7 +185,9 @@ public class TestLinuxContainerExecutor
Path scriptPath = new Path(script);
Path tokensPath = new Path("/dev/null");
Path workDir = new Path(workSpace.getAbsolutePath());
-
+ Path pidFile = new Path(workDir, "pid.txt");
+
+ exec.activateContainer(cId, pidFile);
return exec.launchContainer(container, scriptPath, tokensPath,
appSubmitter, appId, workDir);
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java Thu Oct 27 12:06:08 2011
@@ -60,7 +60,8 @@ public class TestLinuxContainerExecutorW
private List<String> readMockParams() throws IOException {
LinkedList<String> ret = new LinkedList<String>();
- LineNumberReader reader = new LineNumberReader(new FileReader(mockParamFile));
+ LineNumberReader reader = new LineNumberReader(new FileReader(
+ mockParamFile));
String line;
while((line = reader.readLine()) != null) {
ret.add(line);
@@ -70,7 +71,7 @@ public class TestLinuxContainerExecutorW
}
@Before
- public void setup() throws IOException {
+ public void setup() {
File f = new File("./src/test/resources/mock-container-executor");
if(!f.canExecute()) {
f.setExecutable(true);
@@ -83,7 +84,7 @@ public class TestLinuxContainerExecutorW
}
@After
- public void tearDown() throws IOException {
+ public void tearDown() {
deleteMockParamFile();
}
@@ -109,11 +110,14 @@ public class TestLinuxContainerExecutorW
Path scriptPath = new Path("file:///bin/echo");
Path tokensPath = new Path("file:///dev/null");
Path workDir = new Path("/tmp");
- int ret = mockExec.launchContainer(container, scriptPath, tokensPath,
+ Path pidFile = new Path(workDir, "pid.txt");
+
+ mockExec.activateContainer(cId, pidFile);
+ int ret = mockExec.launchContainer(container, scriptPath, tokensPath,
appSubmitter, appId, workDir);
assertEquals(0, ret);
- assertEquals(Arrays.asList(appSubmitter, cmd, appId, containerId,
- workDir.toString(), "/bin/echo", "/dev/null"),
+ assertEquals(Arrays.asList(appSubmitter, cmd, appId, containerId,
+ workDir.toString(), "/bin/echo", "/dev/null", pidFile),
readMockParams());
}
@@ -141,4 +145,4 @@ public class TestLinuxContainerExecutorW
assertEquals(Arrays.asList(appSubmitter, cmd, "/tmp/testdir"),
readMockParams());
}
-}
\ No newline at end of file
+}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java Thu Oct 27 12:06:08 2011
@@ -280,7 +280,7 @@ public class TestContainerManager extend
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
- Assert.assertEquals(ExitCode.KILLED.getExitCode(),
+ Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
containerStatus.getExitStatus());
// Assert that the process is not alive anymore
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java Thu Oct 27 12:06:08 2011
@@ -168,7 +168,7 @@ public class TestContainer {
wc.localizeResources();
wc.launchContainer();
reset(wc.localizerBus);
- wc.containerFailed(ExitCode.KILLED.getExitCode());
+ wc.containerFailed(ExitCode.FORCE_KILLED.getExitCode());
assertEquals(ContainerState.EXITED_WITH_FAILURE,
wc.c.getContainerState());
verifyCleanupCall(wc);
@@ -268,6 +268,26 @@ public class TestContainer {
}
}
+ @Test
+ public void testLaunchAfterKillRequest() throws Exception {
+ WrappedContainer wc = null;
+ try {
+ wc = new WrappedContainer(14, 314159265358979L, 4344, "yak");
+ wc.initContainer();
+ wc.localizeResources();
+ wc.killContainer();
+ assertEquals(ContainerState.KILLING, wc.c.getContainerState());
+ wc.launchContainer();
+ assertEquals(ContainerState.KILLING, wc.c.getContainerState());
+ wc.containerKilledOnRequest();
+ verifyCleanupCall(wc);
+ } finally {
+ if (wc != null) {
+ wc.finished();
+ }
+ }
+ }
+
private void verifyCleanupCall(WrappedContainer wc) throws Exception {
ResourcesReleasedMatcher matchesReq =
new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(
@@ -511,7 +531,7 @@ public class TestContainer {
public void containerKilledOnRequest() {
c.handle(new ContainerExitEvent(cId,
- ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.KILLED
+ ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.FORCE_KILLED
.getExitCode()));
drainDispatcherEvents();
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java Thu Oct 27 12:06:08 2011
@@ -32,7 +32,6 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
@@ -59,7 +58,6 @@ import org.apache.hadoop.yarn.server.nod
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin;
-import org.apache.hadoop.yarn.util.ProcfsBasedProcessTree;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.junit.Before;
import org.junit.Test;
@@ -76,22 +74,21 @@ public class TestContainerLaunch extends
conf.setClass(
YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR,
LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class);
+ conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 1000);
super.setup();
}
@Test
public void testSpecialCharSymlinks() throws IOException {
- String rootDir = new File(System.getProperty(
- "test.build.data", "/tmp")).getAbsolutePath();
File shellFile = null;
File tempFile = null;
String badSymlink = "foo@zz%_#*&!-+= bar()";
File symLinkFile = null;
try {
- shellFile = new File(rootDir, "hello.sh");
- tempFile = new File(rootDir, "temp.sh");
+ shellFile = new File(tmpDir, "hello.sh");
+ tempFile = new File(tmpDir, "temp.sh");
String timeoutCommand = "echo \"hello\"";
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
shellFile.setExecutable(true);
@@ -113,15 +110,14 @@ public class TestContainerLaunch extends
fos.close();
tempFile.setExecutable(true);
- File rootDirFile = new File(rootDir);
Shell.ShellCommandExecutor shexc
- = new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, rootDirFile);
+ = new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, tmpDir);
shexc.execute();
assertEquals(shexc.getExitCode(), 0);
assert(shexc.getOutput().contains("hello"));
- symLinkFile = new File(rootDir, badSymlink);
+ symLinkFile = new File(tmpDir, badSymlink);
}
finally {
// cleanup
@@ -141,6 +137,7 @@ public class TestContainerLaunch extends
}
// this is a dirty hack - but should be ok for a unittest.
+ @SuppressWarnings({ "rawtypes", "unchecked" })
public static void setNewEnvironmentHack(Map<String, String> newenv) throws Exception {
Class[] classes = Collections.class.getDeclaredClasses();
Map<String, String> env = System.getenv();
@@ -162,7 +159,6 @@ public class TestContainerLaunch extends
*/
@Test
public void testContainerEnvVariables() throws Exception {
- int exitCode = 0;
containerManager.start();
Map<String, String> envWithDummy = new HashMap<String, String>();
@@ -217,7 +213,7 @@ public class TestContainerLaunch extends
new HashMap<String, LocalResource>();
localResources.put(destinationFile, rsrc_alpha);
containerLaunchContext.setLocalResources(localResources);
-
+
// set up the rest of the container
containerLaunchContext.setUser(containerLaunchContext.getUser());
List<String> commands = new ArrayList<String>();
@@ -226,11 +222,11 @@ public class TestContainerLaunch extends
containerLaunchContext.setCommands(commands);
containerLaunchContext.setResource(recordFactory
.newRecordInstance(Resource.class));
- containerLaunchContext.getResource().setMemory(100 * 1024 * 1024);
+ containerLaunchContext.getResource().setMemory(1024);
StartContainerRequest startRequest = recordFactory.newRecordInstance(StartContainerRequest.class);
startRequest.setContainerLaunchContext(containerLaunchContext);
containerManager.startContainer(startRequest);
-
+
int timeoutSecs = 0;
while (!processStartFile.exists() && timeoutSecs++ < 20) {
Thread.sleep(1000);
@@ -238,7 +234,7 @@ public class TestContainerLaunch extends
}
Assert.assertTrue("ProcessStartFile doesn't exist!",
processStartFile.exists());
-
+
// Now verify the contents of the file
BufferedReader reader =
new BufferedReader(new FileReader(processStartFile));
@@ -265,13 +261,13 @@ public class TestContainerLaunch extends
BaseContainerManagerTest.waitForContainerState(containerManager, cId,
ContainerState.COMPLETE);
-
+
GetContainerStatusRequest gcsRequest =
recordFactory.newRecordInstance(GetContainerStatusRequest.class);
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
- Assert.assertEquals(ExitCode.KILLED.getExitCode(),
+ Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
containerStatus.getExitStatus());
// Assert that the process is not alive anymore
@@ -279,4 +275,119 @@ public class TestContainerLaunch extends
exec.signalContainer(user,
pid, Signal.NULL));
}
+
+ @Test
+ public void testDelayedKill() throws Exception {
+ containerManager.start();
+
+ File processStartFile =
+ new File(tmpDir, "pid.txt").getAbsoluteFile();
+
+ // setup a script that can handle sigterm gracefully
+ File scriptFile = new File(tmpDir, "testscript.sh");
+ PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
+ writer.println("#!/bin/bash\n\n");
+ writer.println("echo \"Running testscript for delayed kill\"");
+ writer.println("hello=\"Got SIGTERM\"");
+ writer.println("umask 0");
+ writer.println("trap \"echo $hello >> " + processStartFile + "\" SIGTERM");
+ writer.println("echo \"Writing pid to start file\"");
+ writer.println("echo $$ >> " + processStartFile);
+ writer.println("while true; do\nsleep 1s;\ndone");
+ writer.close();
+ scriptFile.setExecutable(true);
+
+ ContainerLaunchContext containerLaunchContext =
+ recordFactory.newRecordInstance(ContainerLaunchContext.class);
+
+ // ////// Construct the Container-id
+ ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
+ appId.setClusterTimestamp(1);
+ appId.setId(1);
+ ApplicationAttemptId appAttemptId =
+ recordFactory.newRecordInstance(ApplicationAttemptId.class);
+ appAttemptId.setApplicationId(appId);
+ appAttemptId.setAttemptId(1);
+ ContainerId cId =
+ recordFactory.newRecordInstance(ContainerId.class);
+ cId.setApplicationAttemptId(appAttemptId);
+ containerLaunchContext.setContainerId(cId);
+
+ containerLaunchContext.setUser(user);
+
+ // upload the script file so that the container can run it
+ URL resource_alpha =
+ ConverterUtils.getYarnUrlFromPath(localFS
+ .makeQualified(new Path(scriptFile.getAbsolutePath())));
+ LocalResource rsrc_alpha =
+ recordFactory.newRecordInstance(LocalResource.class);
+ rsrc_alpha.setResource(resource_alpha);
+ rsrc_alpha.setSize(-1);
+ rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
+ rsrc_alpha.setType(LocalResourceType.FILE);
+ rsrc_alpha.setTimestamp(scriptFile.lastModified());
+ String destinationFile = "dest_file.sh";
+ Map<String, LocalResource> localResources =
+ new HashMap<String, LocalResource>();
+ localResources.put(destinationFile, rsrc_alpha);
+ containerLaunchContext.setLocalResources(localResources);
+
+ // set up the rest of the container
+ containerLaunchContext.setUser(containerLaunchContext.getUser());
+ List<String> commands = new ArrayList<String>();
+ commands.add(scriptFile.getAbsolutePath());
+ containerLaunchContext.setCommands(commands);
+ containerLaunchContext.setResource(recordFactory
+ .newRecordInstance(Resource.class));
+ containerLaunchContext.getResource().setMemory(1024);
+ StartContainerRequest startRequest = recordFactory.newRecordInstance(StartContainerRequest.class);
+ startRequest.setContainerLaunchContext(containerLaunchContext);
+ containerManager.startContainer(startRequest);
+
+ int timeoutSecs = 0;
+ while (!processStartFile.exists() && timeoutSecs++ < 20) {
+ Thread.sleep(1000);
+ LOG.info("Waiting for process start-file to be created");
+ }
+ Assert.assertTrue("ProcessStartFile doesn't exist!",
+ processStartFile.exists());
+
+ // Now test the stop functionality.
+ StopContainerRequest stopRequest = recordFactory.newRecordInstance(StopContainerRequest.class);
+ stopRequest.setContainerId(cId);
+ containerManager.stopContainer(stopRequest);
+
+ BaseContainerManagerTest.waitForContainerState(containerManager, cId,
+ ContainerState.COMPLETE);
+
+ // container stop sends a sigterm followed by a sigkill
+ GetContainerStatusRequest gcsRequest =
+ recordFactory.newRecordInstance(GetContainerStatusRequest.class);
+ gcsRequest.setContainerId(cId);
+ ContainerStatus containerStatus =
+ containerManager.getContainerStatus(gcsRequest).getStatus();
+ Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
+ containerStatus.getExitStatus());
+
+ // Now verify the contents of the file
+ // Script generates a message when it receives a sigterm
+ // so we look for that
+ BufferedReader reader =
+ new BufferedReader(new FileReader(processStartFile));
+
+ boolean foundSigTermMessage = false;
+ while (true) {
+ String line = reader.readLine();
+ if (line == null) {
+ break;
+ }
+ if (line.contains("SIGTERM")) {
+ foundSigTermMessage = true;
+ break;
+ }
+ }
+ Assert.assertTrue("Did not find sigterm message", foundSigTermMessage);
+ reader.close();
+ }
+
}
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java Thu Oct 27 12:06:08 2011
@@ -262,7 +262,7 @@ public class TestContainersMonitor exten
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
- Assert.assertEquals(ExitCode.KILLED.getExitCode(),
+ Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
containerStatus.getExitStatus());
String expectedMsgPattern =
"Container \\[pid=" + pid + ",containerID=" + cId
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor?rev=1189713&r1=1189712&r2=1189713&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/mock-container-executor Thu Oct 27 12:06:08 2011
@@ -5,6 +5,6 @@ do
done > params.txt
if [[ "$2" == "1" ]];
then
- cd $5;
- exec $6;
+ cd $6;
+ exec $7;
fi;