You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2014/08/09 01:09:30 UTC

git commit: Retry freeze in cgroups TasksKiller to work around MESOS-1689.

Repository: mesos
Updated Branches:
  refs/heads/master d376f05fe -> 52cf9b3ff


Retry freeze in cgroups TasksKiller to work around MESOS-1689.

Review: https://reviews.apache.org/r/24511


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/52cf9b3f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/52cf9b3f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/52cf9b3f

Branch: refs/heads/master
Commit: 52cf9b3ffbbe7648d1b529b5112b1b5b6360eaa5
Parents: d376f05
Author: Jie Yu <yu...@gmail.com>
Authored: Fri Aug 8 14:56:16 2014 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Fri Aug 8 15:44:23 2014 -0700

----------------------------------------------------------------------
 src/linux/cgroups.cpp | 31 ++++++++++++++++++++++++++++++-
 src/linux/cgroups.hpp |  7 +++++++
 2 files changed, 37 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/52cf9b3f/src/linux/cgroups.cpp
----------------------------------------------------------------------
diff --git a/src/linux/cgroups.cpp b/src/linux/cgroups.cpp
index 39a4874..47be0ef 100644
--- a/src/linux/cgroups.cpp
+++ b/src/linux/cgroups.cpp
@@ -1470,10 +1470,30 @@ protected:
   virtual void finalize()
   {
     chain.discard();
+
+    // TODO(jieyu): Wait until 'chain' is in DISCARDED state before
+    // discarding 'promise'.
     promise.discard();
   }
 
 private:
+  static Future<Nothing> freezeTimedout(
+      Future<Nothing> future,
+      const PID<TasksKiller>& pid,
+      const string& hierarchy,
+      const string& cgroup)
+  {
+    // Cancel the freeze operation.
+    // TODO(jieyu): Wait until 'future' is in DISCARDED state before
+    // starting retry.
+    future.discard();
+
+    // Thaw the cgroup before trying to freeze again to allow any
+    // pending signals to be delivered. See MESOS-1689 for details.
+    return cgroups::freezer::thaw(hierarchy, cgroup)
+      .then(defer(pid, &Self::freeze));
+  }
+
   void killTasks() {
     // Chain together the steps needed to kill all tasks in the cgroup.
     chain = freeze()                     // Freeze the cgroup.
@@ -1486,7 +1506,16 @@ private:
 
   Future<Nothing> freeze()
   {
-    return cgroups::freezer::freeze(hierarchy, cgroup);
+    // TODO(jieyu): This is a workaround for MESOS-1689. We will move
+    // away from freezer once we have pid namespace support.
+    return cgroups::freezer::freeze(hierarchy, cgroup).after(
+        FREEZE_RETRY_INTERVAL,
+        lambda::bind(
+            &freezeTimedout,
+            lambda::_1,
+            self(),
+            hierarchy,
+            cgroup));
   }
 
   Future<Nothing> kill()

http://git-wip-us.apache.org/repos/asf/mesos/blob/52cf9b3f/src/linux/cgroups.hpp
----------------------------------------------------------------------
diff --git a/src/linux/cgroups.hpp b/src/linux/cgroups.hpp
index 9dfba6e..26dcb3d 100644
--- a/src/linux/cgroups.hpp
+++ b/src/linux/cgroups.hpp
@@ -45,6 +45,13 @@ namespace cgroups {
 // explicitly specified.
 const Duration DESTROY_TIMEOUT = Seconds(60);
 
+
+// Freezing a cgroup may get stuck (see MESOS-1689 for details). To
+// work around this, we may want to thaw the cgroup and retry
+// freezing it. This is the suggested retry interval.
+const Duration FREEZE_RETRY_INTERVAL = Seconds(10);
+
+
 // Default number of assign attempts when moving threads to a cgroup.
 const unsigned int THREAD_ASSIGN_RETRIES = 100;