You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2019/01/08 00:17:50 UTC

[mesos] branch 1.6.x updated: Sent SIGKILL to I/O switchboard server as a safeguard.

This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a commit to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.6.x by this push:
     new a092c2c  Sent SIGKILL to I/O switchboard server as a safeguard.
a092c2c is described below

commit a092c2c9e13914b248e07749c107ed65f088af79
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Mon Jan 7 16:16:12 2019 -0800

    Sent SIGKILL to I/O switchboard server as a safeguard.
    
    Review: https://reviews.apache.org/r/69667/
    (cherry picked from commit 3478e344fb77d931f6122980c6e94cd3913c441d)
---
 src/slave/containerizer/mesos/io/switchboard.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/slave/containerizer/mesos/io/switchboard.cpp b/src/slave/containerizer/mesos/io/switchboard.cpp
index d96a4de..ceb2140 100644
--- a/src/slave/containerizer/mesos/io/switchboard.cpp
+++ b/src/slave/containerizer/mesos/io/switchboard.cpp
@@ -819,6 +819,23 @@ Future<Nothing> IOSwitchboard::cleanup(
                   << " is being destroyed";
 
         os::kill(pid.get(), SIGTERM);
+
+        Clock::timer(Seconds(60), [pid, status, containerId]() {
+          if (status.isPending()) {
+            // If we are here, something really bad must have happened for I/O
+            // switchboard server to not exit after SIGTERM has been sent. We
+            // have seen this happen due to FD leak (see MESOS-9502). We do a
+            // SIGKILL here as a safeguard so that switchboard server forcefully
+            // exits and causes this cleanup future to be completed, thus
+            // unblocking the container's cleanup.
+            LOG(ERROR) << "Sending SIGKILL to I/O switchboard server (pid: "
+                       << pid.get() << ") for container " << containerId
+                       << " since the I/O switchboard server did not terminate "
+                       << "60 seconds after SIGTERM was sent to it";
+
+            os::kill(pid.get(), SIGKILL);
+          }
+        });
       }
     });
   }