You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by be...@apache.org on 2012/10/31 17:39:36 UTC
svn commit: r1404243 - in /incubator/mesos/trunk/src/linux: cgroups.cpp
cgroups.hpp proc.hpp
Author: benh
Date: Wed Oct 31 16:39:35 2012
New Revision: 1404243
URL: http://svn.apache.org/viewvc?rev=1404243&view=rev
Log:
Added a retry limit on cgroups freezing attempts.
From: Ben Mahler <be...@gmail.com>
Review: https://reviews.apache.org/r/7712
Modified:
incubator/mesos/trunk/src/linux/cgroups.cpp
incubator/mesos/trunk/src/linux/cgroups.hpp
incubator/mesos/trunk/src/linux/proc.hpp
Modified: incubator/mesos/trunk/src/linux/cgroups.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/linux/cgroups.cpp?rev=1404243&r1=1404242&r2=1404243&view=diff
==============================================================================
--- incubator/mesos/trunk/src/linux/cgroups.cpp (original)
+++ incubator/mesos/trunk/src/linux/cgroups.cpp Wed Oct 31 16:39:35 2012
@@ -1070,14 +1070,16 @@ namespace internal {
class Freezer : public Process<Freezer>
{
public:
- Freezer(const string& _hierarchy,
- const string& _cgroup,
- const string& _action,
- const Duration& _interval)
+ Freezer(const std::string& _hierarchy,
+ const std::string& _cgroup,
+ const std::string& _action,
+ const Duration& _interval,
+ unsigned int _retries = FREEZE_RETRIES)
: hierarchy(_hierarchy),
cgroup(_cgroup),
action(_action),
- interval(_interval) {}
+ interval(_interval),
+ retries(_retries) {}
virtual ~Freezer() {}
@@ -1139,10 +1141,8 @@ private:
}
}
- void watchFrozen()
+ void watchFrozen(unsigned int attempt = 0)
{
- LOG(INFO) << "Checking frozen status of cgroup '" << cgroup << "'";
-
Try<string> state =
internal::readControl(hierarchy, cgroup, "freezer.state");
@@ -1153,7 +1153,8 @@ private:
}
if (strings::trim(state.get()) == "FROZEN") {
- LOG(INFO) << "Successfully froze cgroup '" << cgroup << "'";
+ LOG(INFO) << "Successfully froze cgroup '" << cgroup << "' after "
+ << attempt + 1 << " attempts.";
promise.set(true);
terminate(self());
} else if (strings::trim(state.get()) == "FREEZING") {
@@ -1195,7 +1196,13 @@ private:
}
}
- LOG(INFO) << "Retrying to freeze cgroup '" << cgroup << "'";
+ if (attempt > retries) {
+ LOG(WARNING) << "Could not freeze cgroup '" << cgroup << "' within "
+ << retries + 1 << " attempts";
+ promise.set(false);
+ terminate(self());
+ return;
+ }
Try<Nothing> result =
internal::writeControl(hierarchy, cgroup, "freezer.state", "FROZEN");
@@ -1207,7 +1214,7 @@ private:
}
// Not done yet, keep watching (and possibly retrying).
- delay(interval, self(), &Freezer::watchFrozen);
+ delay(interval, self(), &Freezer::watchFrozen, attempt + 1);
} else {
LOG(FATAL) << "Unexpected state: " << strings::trim(state.get());
}
@@ -1215,8 +1222,6 @@ private:
void watchThawed()
{
- LOG(INFO) << "Checking thaw status of cgroup '" << cgroup << "'";
-
Try<string> state =
internal::readControl(hierarchy, cgroup, "freezer.state");
@@ -1242,6 +1247,7 @@ private:
const string cgroup;
const string action;
const Duration interval;
+ const unsigned int retries;
Promise<bool> promise;
};
@@ -1250,9 +1256,10 @@ private:
Future<bool> freezeCgroup(
- const string& hierarchy,
- const string& cgroup,
- const Duration& interval)
+ const std::string& hierarchy,
+ const std::string& cgroup,
+ const Duration& interval,
+ unsigned int retries)
{
Try<Nothing> check = checkControl(hierarchy, cgroup, "freezer.state");
if (check.isError()) {
@@ -1271,7 +1278,7 @@ Future<bool> freezeCgroup(
}
internal::Freezer* freezer =
- new internal::Freezer(hierarchy, cgroup, "FREEZE", interval);
+ new internal::Freezer(hierarchy, cgroup, "FREEZE", interval, retries);
Future<bool> future = freezer->future();
spawn(freezer, true);
return future;
@@ -1409,7 +1416,7 @@ protected:
.then(funcFreeze) // Freeze the cgroup.
.then(funcKill) // Send kill signals to all tasks in the cgroup.
.then(funcThaw) // Thaw the cgroup to let kill signals be received.
- .then(funcEmpty); // Wait until no task in the cgroup.
+ .then(funcEmpty); // Wait until no task is in the cgroup.
finish.onAny(defer(self(), &Self::finished, lambda::_1));
}
@@ -1423,9 +1430,25 @@ protected:
}
private:
+ // TODO(bmahler): Temporary abort for failed freezes.
+ // TODO(bmahler): Remove this const when libprocess is fixed.
+ // This will be removed by another review in this review chain in favor
+ // of appropriate kill action against the cgroup.
+ Future<bool> checkFreeze(const bool& result)
+ {
+ CHECK(result == true)
+ << "Freezing failed for cgroup: " << cgroup
+ << " of hierarchy: " << hierarchy;
+ return result;
+ }
+
Future<bool> freeze()
{
- return freezeCgroup(hierarchy, cgroup, interval);
+ lambda::function<Future<bool>(const bool&)> checkFreeze =
+ defer(self(), &Self::checkFreeze, std::tr1::placeholders::_1);
+
+ return freezeCgroup(hierarchy, cgroup, interval)
+ .then(checkFreeze);
}
Future<bool> kill()
Modified: incubator/mesos/trunk/src/linux/cgroups.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/linux/cgroups.hpp?rev=1404243&r1=1404242&r2=1404243&view=diff
==============================================================================
--- incubator/mesos/trunk/src/linux/cgroups.hpp (original)
+++ incubator/mesos/trunk/src/linux/cgroups.hpp Wed Oct 31 16:39:35 2012
@@ -33,6 +33,9 @@
namespace cgroups {
+// Default number of retry attempts when trying to freeze a cgroup.
+const unsigned int FREEZE_RETRIES = 50;
+
// We use the following notations throughout the cgroups code. The notations
// here are derived from the kernel documentation. More details can be found in
@@ -269,11 +272,15 @@ process::Future<uint64_t> listenEvent(co
// @param cgroup Path to the cgroup relative to the hierarchy root.
// @param interval The time interval between two state check
// requests (default: 0.1 seconds).
-// @return A future which will become ready when all processes are frozen.
+// @param retries Number of retry attempts before giving up. None
+// indicates infinite retries. (default: 50 attempts).
+// @return A future which will become true when all processes are frozen, or
+// false when all retries have occurred unsuccessfully.
// Error if some unexpected happens.
process::Future<bool> freezeCgroup(const std::string& hierarchy,
const std::string& cgroup,
- const Duration& interval = Seconds(0.1));
+ const Duration& interval = Seconds(0.1),
+ const unsigned int retries = FREEZE_RETRIES);
// Thaw the given cgroup. This is a revert operation of freezeCgroup. It will
Modified: incubator/mesos/trunk/src/linux/proc.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/linux/proc.hpp?rev=1404243&r1=1404242&r2=1404243&view=diff
==============================================================================
--- incubator/mesos/trunk/src/linux/proc.hpp (original)
+++ incubator/mesos/trunk/src/linux/proc.hpp Wed Oct 31 16:39:35 2012
@@ -58,6 +58,8 @@ struct SystemStatistics
// Snapshot of a process (modeled after /proc/[pid]/stat).
+// For more information, see:
+// http://www.kernel.org/doc/Documentation/filesystems/proc.txt
struct ProcessStatistics
{
ProcessStatistics(