You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2016/08/12 04:37:27 UTC

mesos git commit: Implemented `CgroupsIsolatorProcess::recover`.

Repository: mesos
Updated Branches:
  refs/heads/master 00c98ca2b -> a12b21278


Implemented `CgroupsIsolatorProcess::recover`.

Review: https://reviews.apache.org/r/49817/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a12b2127
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a12b2127
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a12b2127

Branch: refs/heads/master
Commit: a12b21278445a08de8cbe587d1274f58eeb4fc0c
Parents: 00c98ca
Author: haosdent huang <ha...@gmail.com>
Authored: Thu Aug 11 16:58:45 2016 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Thu Aug 11 21:34:22 2016 -0700

----------------------------------------------------------------------
 .../mesos/isolators/cgroups/cgroups.cpp         | 193 ++++++++++++++++++-
 .../mesos/isolators/cgroups/cgroups.hpp         |  15 ++
 2 files changed, 207 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a12b2127/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
index 2659252..7e205a3 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
@@ -147,7 +147,198 @@ Future<Nothing> CgroupsIsolatorProcess::recover(
     const list<ContainerState>& states,
     const hashset<ContainerID>& orphans)
 {
-  return Failure("Not implemented.");
+  // Recover active containers first.
+  list<Future<Nothing>> recovers;
+  foreach (const ContainerState& state, states) {
+    recovers.push_back(___recover(state.container_id()));
+  }
+
+  return await(recovers)
+    .then(defer(
+        PID<CgroupsIsolatorProcess>(this),
+        &CgroupsIsolatorProcess::_recover,
+        orphans,
+        lambda::_1));
+}
+
+
+Future<Nothing> CgroupsIsolatorProcess::_recover(
+    const hashset<ContainerID>& orphans,
+    const list<Future<Nothing>>& futures)
+{
+  vector<string> errors;  // Reasons for each future that is not ready.
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back((future.isFailed()
+          ? future.failure()
+          : "discarded"));
+    }
+  }
+
+  if (errors.size() > 0) {
+    return Failure(
+        "Failed to recover active containers: " +
+        strings::join(";", errors));
+  }
+
+  hashset<ContainerID> knownOrphans;  // Cgroup-derived IDs in `orphans`.
+  hashset<ContainerID> unknownOrphans;  // Cgroup-derived IDs not in `orphans`.
+
+  foreach (const string& hierarchy, subsystems.keys()) {
+    // TODO(jieyu): Use non-recursive version of `cgroups::get`.
+    Try<vector<string>> cgroups = cgroups::get(
+        hierarchy,
+        flags.cgroups_root);
+
+    if (cgroups.isError()) {
+      return Failure(
+          "Failed to list cgroups under '" + hierarchy + "': " +
+          cgroups.error());
+    }
+
+    foreach (const string& cgroup, cgroups.get()) {
+      // Ignore the slave cgroup (see the --slave_subsystems flag).
+      // TODO(idownes): Remove this when the cgroups layout is
+      // updated, see MESOS-1185.
+      if (cgroup == path::join(flags.cgroups_root, "slave")) {
+        continue;
+      }
+
+      ContainerID containerId;
+      containerId.set_value(Path(cgroup).basename());
+
+      // Skip containers that have already been recovered (present in `infos`).
+      if (infos.contains(containerId)) {
+        continue;
+      }
+
+      if (orphans.contains(containerId)) {
+        knownOrphans.insert(containerId);
+      } else {
+        unknownOrphans.insert(containerId);
+      }
+    }
+  }
+
+  list<Future<Nothing>> recovers;
+
+  foreach (const ContainerID& containerId, knownOrphans) {
+    recovers.push_back(___recover(containerId));
+  }
+
+  foreach (const ContainerID& containerId, unknownOrphans) {
+    recovers.push_back(___recover(containerId));
+  }
+
+  return await(recovers)
+    .then(defer(
+        PID<CgroupsIsolatorProcess>(this),
+        &CgroupsIsolatorProcess::__recover,
+        unknownOrphans,
+        lambda::_1));
+}
+
+
+Future<Nothing> CgroupsIsolatorProcess::__recover(
+    const hashset<ContainerID>& unknownOrphans,
+    const list<Future<Nothing>>& futures)
+{
+  vector<string> errors;  // Reasons for each future that is not ready.
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back((future.isFailed()
+          ? future.failure()
+          : "discarded"));
+    }
+  }
+
+  if (errors.size() > 0) {
+    return Failure(
+        "Failed to recover orphan containers: " +
+        strings::join(";", errors));
+  }
+
+  // Clean up unknown orphans only; known orphan cgroups will be destroyed
+  // by the containerizer using the normal cleanup path. See MESOS-2367.
+  foreach (const ContainerID& containerId, unknownOrphans) {
+    LOG(INFO) << "Cleaning up unknown orphaned container " << containerId;
+    cleanup(containerId);  // NOTE(review): any result is not inspected.
+  }
+
+  return Nothing();
+}
+
+
+Future<Nothing> CgroupsIsolatorProcess::___recover(
+    const ContainerID& containerId)
+{
+  const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+  list<Future<Nothing>> recovers;  // One future per subsystem recovery.
+
+  // TODO(haosdent): Use foreachkey once MESOS-5037 is resolved.
+  foreach (const string& hierarchy, subsystems.keys()) {
+    Try<bool> exists = cgroups::exists(hierarchy, cgroup);
+    if (exists.isError()) {
+      return Failure(
+          "Failed to check the existence of the cgroup "
+          "'" + cgroup + "' in hierarchy '" + hierarchy + "' "
+          "for container " + stringify(containerId) +
+          ": " + exists.error());
+    }
+
+    if (!exists.get()) {
+      // This may occur if the executor has exited and the isolator
+      // has destroyed the cgroup but the agent dies before noticing
+      // this. This will be detected when the containerizer tries to
+      // monitor the executor's pid.
+      LOG(WARNING) << "Couldn't find the cgroup '" << cgroup << "' "
+                   << "in hierarchy '" << hierarchy << "' "
+                   << "for container " << containerId;
+
+      continue;  // Skip recovering this hierarchy's subsystems.
+    }
+
+    foreach (const Owned<Subsystem>& subsystem, subsystems.get(hierarchy)) {
+      recovers.push_back(subsystem->recover(containerId));
+    }
+  }
+
+  return await(recovers)
+    .then(defer(
+        PID<CgroupsIsolatorProcess>(this),
+        &CgroupsIsolatorProcess::____recover,
+        containerId,
+        lambda::_1));
+}
+
+
+Future<Nothing> CgroupsIsolatorProcess::____recover(
+    const ContainerID& containerId,
+    const list<Future<Nothing>>& futures)
+{
+  vector<string> errors;  // Reasons for each future that is not ready.
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back((future.isFailed()
+          ? future.failure()
+          : "discarded"));
+    }
+  }
+
+  if (errors.size() > 0) {
+    return Failure(
+        "Failed to recover subsystems: " +
+        strings::join(";", errors));
+  }
+
+  CHECK(!infos.contains(containerId));  // Must not already be tracked.
+
+  infos[containerId] = Owned<Info>(new Info(  // Track the container.
+      containerId,
+      path::join(flags.cgroups_root, containerId.value())));
+
+  return Nothing();
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/a12b2127/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
index b191b2a..9b2d33e 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
@@ -102,6 +102,21 @@ private:
       const hashmap<std::string, std::string>& _hierarchies,
       const multihashmap<std::string, process::Owned<Subsystem>>& _subsystems);
 
+  process::Future<Nothing> _recover(
+      const hashset<ContainerID>& orphans,
+      const std::list<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> __recover(
+      const hashset<ContainerID>& unknownOrphans,
+      const std::list<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> ___recover(
+    const ContainerID& containerId);
+
+  process::Future<Nothing> ____recover(
+    const ContainerID& containerId,
+    const std::list<process::Future<Nothing>>& futures);
+
   process::Future<Option<mesos::slave::ContainerLaunchInfo>> _prepare(
       const ContainerID& containerId,
       const mesos::slave::ContainerConfig& containerConfig,