You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 07:02:12 UTC
svn commit: r1131726 - in /incubator/mesos/trunk/src:
lxc_isolation_module.cpp nexus_exec.cpp process_based_isolation_module.cpp
process_based_isolation_module.hpp
Author: benh
Date: Sun Jun 5 05:02:12 2011
New Revision: 1131726
URL: http://svn.apache.org/viewvc?rev=1131726&view=rev
Log:
A few more details to take care of in getting processes correctly cleaned up.
Modified:
incubator/mesos/trunk/src/lxc_isolation_module.cpp
incubator/mesos/trunk/src/nexus_exec.cpp
incubator/mesos/trunk/src/process_based_isolation_module.cpp
incubator/mesos/trunk/src/process_based_isolation_module.hpp
Modified: incubator/mesos/trunk/src/lxc_isolation_module.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/lxc_isolation_module.cpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/lxc_isolation_module.cpp (original)
+++ incubator/mesos/trunk/src/lxc_isolation_module.cpp Sun Jun 5 05:02:12 2011
@@ -191,6 +191,7 @@ void LxcIsolationModule::Reaper::operato
module->container[fid] = "";
module->lxcExecutePid[fid] = -1;
LOG(INFO) << "Telling slave of lost framework " << fid;
+ // TODO(benh): This is broken if/when libprocess is parallel!
module->slave->executorExited(fid, status);
break;
}
Modified: incubator/mesos/trunk/src/nexus_exec.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/nexus_exec.cpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/nexus_exec.cpp (original)
+++ incubator/mesos/trunk/src/nexus_exec.cpp Sun Jun 5 05:02:12 2011
@@ -1,3 +1,5 @@
+#include <signal.h>
+
#include <cerrno>
#include <iostream>
#include <string>
@@ -97,7 +99,13 @@ protected:
case PROCESS_EXIT: {
// TODO: Pass an argument to shutdown to tell it this is abnormal?
invoke(bind(&Executor::shutdown, executor, driver));
- exit(1);
+
+ // This is a pretty bad state ... no slave is left. Rather
+ // than exit, let's kill our process group (which includes
+ // ourselves) hoping to clean up any processes this executor
+ // launched itself.
+ // TODO(benh): Maybe do a SIGTERM and then later do a SIGKILL?
+ killpg(getpid(), SIGKILL);
}
default: {
Modified: incubator/mesos/trunk/src/process_based_isolation_module.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/process_based_isolation_module.cpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/process_based_isolation_module.cpp (original)
+++ incubator/mesos/trunk/src/process_based_isolation_module.cpp Sun Jun 5 05:02:12 2011
@@ -43,14 +43,14 @@ ProcessBasedIsolationModule::~ProcessBas
void ProcessBasedIsolationModule::frameworkAdded(Framework* framework)
{
- osPid[framework->id] = -1;
+ pgids[framework->id] = -1;
framework->executorStatus = "No executor running";
}
void ProcessBasedIsolationModule::frameworkRemoved(Framework* framework)
{
- osPid.erase(framework->id);
+ pgids.erase(framework->id);
}
@@ -58,16 +58,17 @@ void ProcessBasedIsolationModule::startE
{
LOG(INFO) << "Starting executor for framework " << framework->id << ": "
<< framework->executorInfo.uri;
- CHECK(osPid[framework->id] == -1);
+ CHECK(pgids[framework->id] == -1);
pid_t pid;
if ((pid = fork()) == -1)
PLOG(FATAL) << "Failed to fork to launch new executor";
if (pid) {
- // In parent process, record the pid for killpg later.
+ // In parent process, record the pgid for killpg later (the pid is
+ // the pgid because the child does a setsid below).
LOG(INFO) << "Started executor, OS pid = " << pid;
- osPid[framework->id] = pid;
+ pgids[framework->id] = pid;
framework->executorStatus = "PID: " + lexical_cast<string>(pid);
} else {
// In child process, do setsid to make cleanup easier.
@@ -81,13 +82,13 @@ void ProcessBasedIsolationModule::startE
void ProcessBasedIsolationModule::killExecutor(Framework* fw)
{
- if (osPid[fw->id] != -1) {
+ if (pgids[fw->id] != -1) {
// TODO(benh): Consider sending a SIGTERM, then after so much time
// if it still hasn't exited do a SIGKILL (can use a libprocess
// process for this).
- LOG(INFO) << "Sending SIGKILL to gpid " << osPid[fw->id];
- killpg(osPid[fw->id], SIGKILL);
- osPid[fw->id] = -1;
+ LOG(INFO) << "Sending SIGKILL to gpid " << pgids[fw->id];
+ killpg(pgids[fw->id], SIGKILL);
+ pgids[fw->id] = -1;
fw->executorStatus = "No executor running";
// TODO(benh): Kill all of the process's descendants? Perhaps
// create a new libprocess process that continually tries to kill
@@ -127,9 +128,14 @@ void ProcessBasedIsolationModule::Reaper
pid_t pid;
int status;
if ((pid = waitpid((pid_t) -1, &status, WNOHANG)) > 0) {
- foreachpair (FrameworkID fid, pid_t& fwPid, module->osPid) {
- if (fwPid == pid) {
- module->osPid[fid] = -1;
+ foreachpair (FrameworkID fid, pid_t& pgid, module->pgids) {
+ if (pgid == pid) {
+ // Kill the process group to clean up the tasks.
+ LOG(INFO) << "Sending SIGKILL to gpid " << pgid;
+ killpg(pgid, SIGKILL);
+ module->pgids[fid] = -1;
+ LOG(INFO) << "Telling slave of lost framework " << fid;
+ // TODO(benh): This is broken if/when libprocess is parallel!
module->slave->executorExited(fid, status);
break;
}
Modified: incubator/mesos/trunk/src/process_based_isolation_module.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/process_based_isolation_module.hpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/process_based_isolation_module.hpp (original)
+++ incubator/mesos/trunk/src/process_based_isolation_module.hpp Sun Jun 5 05:02:12 2011
@@ -34,7 +34,7 @@ public:
protected:
Slave* slave;
- unordered_map<FrameworkID, pid_t> osPid;
+ unordered_map<FrameworkID, pid_t> pgids;
Reaper* reaper;
public: