You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2016/08/12 04:37:27 UTC
mesos git commit: Implemented `CgroupsIsolatorProcess::recover`.
Repository: mesos
Updated Branches:
refs/heads/master 00c98ca2b -> a12b21278
Implemented `CgroupsIsolatorProcess::recover`.
Review: https://reviews.apache.org/r/49817/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a12b2127
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a12b2127
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a12b2127
Branch: refs/heads/master
Commit: a12b21278445a08de8cbe587d1274f58eeb4fc0c
Parents: 00c98ca
Author: haosdent huang <ha...@gmail.com>
Authored: Thu Aug 11 16:58:45 2016 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Thu Aug 11 21:34:22 2016 -0700
----------------------------------------------------------------------
.../mesos/isolators/cgroups/cgroups.cpp | 193 ++++++++++++++++++-
.../mesos/isolators/cgroups/cgroups.hpp | 15 ++
2 files changed, 207 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/a12b2127/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
index 2659252..7e205a3 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
@@ -147,7 +147,198 @@ Future<Nothing> CgroupsIsolatorProcess::recover(
const list<ContainerState>& states,
const hashset<ContainerID>& orphans)
{
- return Failure("Not implemented.");
+ // Recover active containers first.
+ list<Future<Nothing>> recovers;
+ foreach (const ContainerState& state, states) {
+ recovers.push_back(___recover(state.container_id()));
+ }
+
+ return await(recovers)
+ .then(defer(
+ PID<CgroupsIsolatorProcess>(this),
+ &CgroupsIsolatorProcess::_recover,
+ orphans,
+ lambda::_1));
+}
+
+
+// Continuation of `recover`, invoked once the `___recover` calls
+// issued for the active containers have all completed.
+//
+// `orphans` is the orphan set that was handed to `recover`;
+// `futures` holds the outcome of each active container's recovery.
+// Returns a Failure if any of those recoveries did not become ready.
+// Otherwise it lists the cgroups under `flags.cgroups_root` in every
+// hierarchy, recovers each cgroup that has no entry in `infos` yet,
+// and continues in `__recover` with the orphans that were not
+// listed in `orphans`.
+Future<Nothing> CgroupsIsolatorProcess::_recover(
+ const hashset<ContainerID>& orphans,
+ const list<Future<Nothing>>& futures)
+{
+ // Collect a message for every future that is not ready: its
+ // failure text if it failed, "discarded" otherwise.
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back((future.isFailed()
+ ? future.failure()
+ : "discarded"));
+ }
+ }
+
+ if (errors.size() > 0) {
+ return Failure(
+ "Failed to recover active containers: " +
+ strings::join(";", errors));
+ }
+
+ // Containers discovered from cgroups, split by whether they appear
+ // in `orphans` (known) or not (unknown).
+ hashset<ContainerID> knownOrphans;
+ hashset<ContainerID> unknownOrphans;
+
+ foreach (const string& hierarchy, subsystems.keys()) {
+ // TODO(jieyu): Use non-recursive version of `cgroups::get`.
+ Try<vector<string>> cgroups = cgroups::get(
+ hierarchy,
+ flags.cgroups_root);
+
+ if (cgroups.isError()) {
+ return Failure(
+ "Failed to list cgroups under '" + hierarchy + "': " +
+ cgroups.error());
+ }
+
+ foreach (const string& cgroup, cgroups.get()) {
+ // Ignore the slave cgroup (see the --slave_subsystems flag).
+ // TODO(idownes): Remove this when the cgroups layout is
+ // updated, see MESOS-1185.
+ if (cgroup == path::join(flags.cgroups_root, "slave")) {
+ continue;
+ }
+
+ // The container ID is taken from the last path component of
+ // the cgroup.
+ ContainerID containerId;
+ containerId.set_value(Path(cgroup).basename());
+
+ // Skip containers which have already been recovered.
+ if (infos.contains(containerId)) {
+ continue;
+ }
+
+ if (orphans.contains(containerId)) {
+ knownOrphans.insert(containerId);
+ } else {
+ unknownOrphans.insert(containerId);
+ }
+ }
+ }
+
+ // Known and unknown orphans are recovered the same way; only the
+ // unknown ones are forwarded to `__recover`.
+ list<Future<Nothing>> recovers;
+
+ foreach (const ContainerID& containerId, knownOrphans) {
+ recovers.push_back(___recover(containerId));
+ }
+
+ foreach (const ContainerID& containerId, unknownOrphans) {
+ recovers.push_back(___recover(containerId));
+ }
+
+ return await(recovers)
+ .then(defer(
+ PID<CgroupsIsolatorProcess>(this),
+ &CgroupsIsolatorProcess::__recover,
+ unknownOrphans,
+ lambda::_1));
+}
+
+
+// Last stage of recovery, invoked by `_recover` once the orphan
+// containers' `___recover` calls have completed.
+//
+// `unknownOrphans` are the containers found in cgroups that were not
+// in the orphan set passed to `recover`; `futures` holds the outcome
+// of every orphan recovery (known and unknown). Returns a Failure if
+// any recovery did not become ready, otherwise calls `cleanup` for
+// each unknown orphan and returns Nothing.
+Future<Nothing> CgroupsIsolatorProcess::__recover(
+ const hashset<ContainerID>& unknownOrphans,
+ const list<Future<Nothing>>& futures)
+{
+ // Collect a message for every future that is not ready: its
+ // failure text if it failed, "discarded" otherwise.
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back((future.isFailed()
+ ? future.failure()
+ : "discarded"));
+ }
+ }
+
+ if (errors.size() > 0) {
+ return Failure(
+ "Failed to recover orphan containers: " +
+ strings::join(";", errors));
+ }
+
+ // Known orphan cgroups will be destroyed by the containerizer using
+ // the normal cleanup path. See MESOS-2367 for details.
+ // NOTE(review): the value returned by `cleanup` is not examined
+ // here, so this function returns without looking at the outcome of
+ // the cleanups -- confirm this is intentional.
+ foreach (const ContainerID& containerId, unknownOrphans) {
+ LOG(INFO) << "Cleaning up unknown orphaned container " << containerId;
+ cleanup(containerId);
+ }
+
+ return Nothing();
+}
+
+
+// Recovers a single container.
+//
+// For every hierarchy in `subsystems`, checks whether the container's
+// cgroup (`flags.cgroups_root` joined with the container ID) exists.
+// An error from that check aborts with a Failure. A missing cgroup is
+// logged and that hierarchy is skipped. Otherwise `recover` is called
+// on each subsystem registered for the hierarchy. The collected
+// futures are awaited and handed to `____recover`.
+Future<Nothing> CgroupsIsolatorProcess::___recover(
+ const ContainerID& containerId)
+{
+ const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+ list<Future<Nothing>> recovers;
+
+ // TODO(haosdent): Use foreachkey once MESOS-5037 is resolved.
+ foreach (const string& hierarchy, subsystems.keys()) {
+ Try<bool> exists = cgroups::exists(hierarchy, cgroup);
+ if (exists.isError()) {
+ return Failure(
+ "Failed to check the existence of the cgroup "
+ "'" + cgroup + "' in hierarchy '" + hierarchy + "' "
+ "for container " + stringify(containerId) +
+ ": " + exists.error());
+ }
+
+ if (!exists.get()) {
+ // This may occur if the executor has exited and the isolator
+ // has destroyed the cgroup but the agent dies before noticing
+ // this. This will be detected when the containerizer tries to
+ // monitor the executor's pid.
+ LOG(WARNING) << "Couldn't find the cgroup '" << cgroup << "' "
+ << "in hierarchy '" << hierarchy << "' "
+ << "for container " << containerId;
+
+ continue;
+ }
+
+ // Ask every subsystem attached to this hierarchy to recover the
+ // container.
+ foreach (const Owned<Subsystem>& subsystem, subsystems.get(hierarchy)) {
+ recovers.push_back(subsystem->recover(containerId));
+ }
+ }
+
+ return await(recovers)
+ .then(defer(
+ PID<CgroupsIsolatorProcess>(this),
+ &CgroupsIsolatorProcess::____recover,
+ containerId,
+ lambda::_1));
+}
+
+
+// Continuation of `___recover`, invoked once the subsystem recoveries
+// for `containerId` have completed.
+//
+// Returns a Failure if any entry of `futures` did not become ready.
+// Otherwise records the container in `infos` with its cgroup path
+// (`flags.cgroups_root` joined with the container ID) and returns
+// Nothing.
+Future<Nothing> CgroupsIsolatorProcess::____recover(
+ const ContainerID& containerId,
+ const list<Future<Nothing>>& futures)
+{
+ // Collect a message for every future that is not ready: its
+ // failure text if it failed, "discarded" otherwise.
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back((future.isFailed()
+ ? future.failure()
+ : "discarded"));
+ }
+ }
+
+ if (errors.size() > 0) {
+ return Failure(
+ "Failed to recover subsystems: " +
+ strings::join(";", errors));
+ }
+
+ // The container must not have an entry in `infos` at this point.
+ CHECK(!infos.contains(containerId));
+
+ infos[containerId] = Owned<Info>(new Info(
+ containerId,
+ path::join(flags.cgroups_root, containerId.value())));
+
+ return Nothing();
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/a12b2127/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
index b191b2a..9b2d33e 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
@@ -102,6 +102,21 @@ private:
const hashmap<std::string, std::string>& _hierarchies,
const multihashmap<std::string, process::Owned<Subsystem>>& _subsystems);
+ // Continuation of `recover`: receives the orphan set and the
+ // results of recovering the active containers.
+ process::Future<Nothing> _recover(
+ const hashset<ContainerID>& orphans,
+ const std::list<process::Future<Nothing>>& futures);
+
+ // Continuation of `_recover`: receives the unknown orphans and the
+ // results of recovering all orphan containers.
+ process::Future<Nothing> __recover(
+ const hashset<ContainerID>& unknownOrphans,
+ const std::list<process::Future<Nothing>>& futures);
+
+ // Recovers a single container across all hierarchies.
+ process::Future<Nothing> ___recover(
+ const ContainerID& containerId);
+
+ // Continuation of `___recover`: receives the results of the
+ // subsystem recoveries for `containerId`.
+ process::Future<Nothing> ____recover(
+ const ContainerID& containerId,
+ const std::list<process::Future<Nothing>>& futures);
+
process::Future<Option<mesos::slave::ContainerLaunchInfo>> _prepare(
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig,