You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 07:02:12 UTC

svn commit: r1131726 - in /incubator/mesos/trunk/src: lxc_isolation_module.cpp nexus_exec.cpp process_based_isolation_module.cpp process_based_isolation_module.hpp

Author: benh
Date: Sun Jun  5 05:02:12 2011
New Revision: 1131726

URL: http://svn.apache.org/viewvc?rev=1131726&view=rev
Log:
A few more details to take care of in getting processes correctly cleaned up.

Modified:
    incubator/mesos/trunk/src/lxc_isolation_module.cpp
    incubator/mesos/trunk/src/nexus_exec.cpp
    incubator/mesos/trunk/src/process_based_isolation_module.cpp
    incubator/mesos/trunk/src/process_based_isolation_module.hpp

Modified: incubator/mesos/trunk/src/lxc_isolation_module.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/lxc_isolation_module.cpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/lxc_isolation_module.cpp (original)
+++ incubator/mesos/trunk/src/lxc_isolation_module.cpp Sun Jun  5 05:02:12 2011
@@ -191,6 +191,7 @@ void LxcIsolationModule::Reaper::operato
 	    module->container[fid] = "";
 	    module->lxcExecutePid[fid] = -1;
 	    LOG(INFO) << "Telling slave of lost framework " << fid;
+	    // TODO(benh): This is broken if/when libprocess is parallel!
 	    module->slave->executorExited(fid, status);
 	    break;
 	  }

Modified: incubator/mesos/trunk/src/nexus_exec.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/nexus_exec.cpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/nexus_exec.cpp (original)
+++ incubator/mesos/trunk/src/nexus_exec.cpp Sun Jun  5 05:02:12 2011
@@ -1,3 +1,5 @@
+#include <signal.h>
+
 #include <cerrno>
 #include <iostream>
 #include <string>
@@ -97,7 +99,13 @@ protected:
         case PROCESS_EXIT: {
           // TODO: Pass an argument to shutdown to tell it this is abnormal?
           invoke(bind(&Executor::shutdown, executor, driver));
-          exit(1);
+
+	  // This is a pretty bad state ... no slave is left. Rather
+	  // than exit, let's kill our process group (which includes
+	  // ourselves), hoping to clean up any processes this executor
+	  // launched itself.
+	  // TODO(benh): Maybe do a SIGTERM and then later do a SIGKILL?
+	  killpg(getpid(), SIGKILL);
         }
 
         default: {

Modified: incubator/mesos/trunk/src/process_based_isolation_module.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/process_based_isolation_module.cpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/process_based_isolation_module.cpp (original)
+++ incubator/mesos/trunk/src/process_based_isolation_module.cpp Sun Jun  5 05:02:12 2011
@@ -43,14 +43,14 @@ ProcessBasedIsolationModule::~ProcessBas
 
 void ProcessBasedIsolationModule::frameworkAdded(Framework* framework)
 {
-  osPid[framework->id] = -1;
+  pgids[framework->id] = -1;
   framework->executorStatus = "No executor running";
 }
 
 
 void ProcessBasedIsolationModule::frameworkRemoved(Framework* framework)
 {
-  osPid.erase(framework->id);
+  pgids.erase(framework->id);
 }
 
 
@@ -58,16 +58,17 @@ void ProcessBasedIsolationModule::startE
 {
   LOG(INFO) << "Starting executor for framework " << framework->id << ": "
             << framework->executorInfo.uri;
-  CHECK(osPid[framework->id] == -1);
+  CHECK(pgids[framework->id] == -1);
 
   pid_t pid;
   if ((pid = fork()) == -1)
     PLOG(FATAL) << "Failed to fork to launch new executor";
 
   if (pid) {
-    // In parent process, record the pid for killpg later.
+    // In parent process, record the pgid for killpg later (the pid is
+    // the pgid because the child does a setsid below).
     LOG(INFO) << "Started executor, OS pid = " << pid;
-    osPid[framework->id] = pid;
+    pgids[framework->id] = pid;
     framework->executorStatus = "PID: " + lexical_cast<string>(pid);
   } else {
     // In child process, do setsid to make cleanup easier.
@@ -81,13 +82,13 @@ void ProcessBasedIsolationModule::startE
 
 void ProcessBasedIsolationModule::killExecutor(Framework* fw)
 {
-  if (osPid[fw->id] != -1) {
+  if (pgids[fw->id] != -1) {
     // TODO(benh): Consider sending a SIGTERM, then after so much time
     // if it still hasn't exited do a SIGKILL (can use a libprocess
     // process for this).
-    LOG(INFO) << "Sending SIGKILL to gpid " << osPid[fw->id];
-    killpg(osPid[fw->id], SIGKILL);
-    osPid[fw->id] = -1;
+    LOG(INFO) << "Sending SIGKILL to gpid " << pgids[fw->id];
+    killpg(pgids[fw->id], SIGKILL);
+    pgids[fw->id] = -1;
     fw->executorStatus = "No executor running";
     // TODO(benh): Kill all of the process's descendants? Perhaps
     // create a new libprocess process that continually tries to kill
@@ -127,9 +128,14 @@ void ProcessBasedIsolationModule::Reaper
       pid_t pid;
       int status;
       if ((pid = waitpid((pid_t) -1, &status, WNOHANG)) > 0) {
-	foreachpair (FrameworkID fid, pid_t& fwPid, module->osPid) {
-	  if (fwPid == pid) {
-	    module->osPid[fid] = -1;
+	foreachpair (FrameworkID fid, pid_t& pgid, module->pgids) {
+	  if (pgid == pid) {
+	    // Kill the process group to clean up the tasks.
+	    LOG(INFO) << "Sending SIGKILL to gpid " << pgid;
+	    killpg(pgid, SIGKILL);
+	    module->pgids[fid] = -1;
+	    LOG(INFO) << "Telling slave of lost framework " << fid;
+	    // TODO(benh): This is broken if/when libprocess is parallel!
 	    module->slave->executorExited(fid, status);
 	    break;
 	  }

Modified: incubator/mesos/trunk/src/process_based_isolation_module.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/process_based_isolation_module.hpp?rev=1131726&r1=1131725&r2=1131726&view=diff
==============================================================================
--- incubator/mesos/trunk/src/process_based_isolation_module.hpp (original)
+++ incubator/mesos/trunk/src/process_based_isolation_module.hpp Sun Jun  5 05:02:12 2011
@@ -34,7 +34,7 @@ public:
 
 protected:
   Slave* slave;
-  unordered_map<FrameworkID, pid_t> osPid;
+  unordered_map<FrameworkID, pid_t> pgids;
   Reaper* reaper;
 
 public: