You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by qi...@apache.org on 2020/08/18 07:43:14 UTC

[mesos] 04/05: Implemented the `recover` method of `volume/csi` isolator.

This is an automated email from the ASF dual-hosted git repository.

qianzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 64cd6b82786de0bde3ddaaf221b5ab9a106c87c7
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Sat Aug 8 23:53:31 2020 +0800

    Implemented the `recover` method of `volume/csi` isolator.
    
    Review: https://reviews.apache.org/r/72753
---
 .../mesos/isolators/volume/csi/isolator.cpp        | 142 +++++++++++++++++++++
 .../mesos/isolators/volume/csi/isolator.hpp        |   2 +
 2 files changed, 144 insertions(+)

diff --git a/src/slave/containerizer/mesos/isolators/volume/csi/isolator.cpp b/src/slave/containerizer/mesos/isolators/volume/csi/isolator.cpp
index d61fe30..535974b 100644
--- a/src/slave/containerizer/mesos/isolators/volume/csi/isolator.cpp
+++ b/src/slave/containerizer/mesos/isolators/volume/csi/isolator.cpp
@@ -14,6 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <list>
 #include <string>
 #include <vector>
 
@@ -35,6 +36,7 @@
 #include "slave/containerizer/mesos/isolators/volume/csi/isolator.hpp"
 #include "slave/containerizer/mesos/isolators/volume/csi/paths.hpp"
 
+using std::list;
 using std::string;
 using std::vector;
 
@@ -104,6 +106,146 @@ Future<Nothing> VolumeCSIIsolatorProcess::recover(
     const vector<ContainerState>& states,
     const hashset<ContainerID>& orphans)
 {
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+
+    Try<Nothing> recover = recoverContainer(containerId);
+    if (recover.isError()) {
+      return Failure(
+          "Failed to recover CSI volumes for container " +
+          stringify(containerId) + ": " + recover.error());
+    }
+  }
+
+  // Recover any orphan containers that we might have check pointed.
+  // These orphan containers will be destroyed by the containerizer
+  // through the regular cleanup path. See MESOS-2367 for details.
+  foreach (const ContainerID& containerId, orphans) {
+    Try<Nothing> recover = recoverContainer(containerId);
+    if (recover.isError()) {
+      return Failure(
+          "Failed to recover CSI volumes for orphan container " +
+          stringify(containerId) + ": " + recover.error());
+    }
+  }
+
+  // Walk through all the checkpointed containers to determine if
+  // there are any unknown orphan containers.
+  Try<list<string>> entries = os::ls(rootDir);
+  if (entries.isError()) {
+    return Failure(
+        "Unable to list CSI volume checkpoint directory '" +
+        rootDir + "': " + entries.error());
+  }
+
+  foreach (const string& entry, entries.get()) {
+    ContainerID containerId =
+      protobuf::parseContainerId(Path(entry).basename());
+
+    // Check if this container has already been recovered.
+    if (infos.contains(containerId)) {
+      continue;
+    }
+
+    // An unknown orphan container. Recover it and then clean it up.
+    Try<Nothing> recover = recoverContainer(containerId);
+    if (recover.isError()) {
+      return Failure(
+          "Failed to recover CSI volumes for orphan container " +
+          stringify(containerId) + ": " + recover.error());
+    }
+
+    LOG(INFO) << "Cleaning up CSI volumes for unknown orphaned "
+              << "container " << containerId;
+
+    cleanup(containerId);
+  }
+
+  return Nothing();
+}
+
+
+Try<Nothing> VolumeCSIIsolatorProcess::recoverContainer(
+    const ContainerID& containerId)
+{
+  const string containerDir = csi::paths::getContainerDir(rootDir, containerId);
+  if (!os::exists(containerDir)) {
+    // This may occur in the following cases:
+    //   1. The container has exited and the isolator has removed the
+    //      container directory in '_cleanup()' but agent dies before
+    //      noticing this.
+    //   2. Agent dies before the isolator checkpoints CSI volumes for
+    //      the container in 'prepare()'.
+    // For the above cases, we do not need to do anything since there
+    // is nothing to clean up for this container after agent restarts.
+    return Nothing();
+  }
+
+  const string volumesPath = csi::paths::getVolumesPath(rootDir, containerId);
+  if (!os::exists(volumesPath)) {
+    // This may occur if agent dies after creating the container directory
+    // but before it checkpoints anything in it.
+    LOG(WARNING) << "The CSI volumes checkpoint file expected at '"
+                 << volumesPath << "' for container " << containerId
+                 << " does not exist";
+
+    // Construct an info object with empty CSI volumes since no CSI volumes
+    // are mounted yet for this container, and this container will be cleaned
+    // up by containerizer (as known orphan container) or by `recover` (as
+    // unknown orphan container).
+    infos.put(containerId, Owned<Info>(new Info(hashset<CSIVolume>())));
+
+    return Nothing();
+  }
+
+  Result<string> read = state::read<string>(volumesPath);
+  if (read.isError()) {
+    return Error(
+        "Failed to read the CSI volumes checkpoint file '" +
+        volumesPath + "': " + read.error());
+  }
+
+  if (read->empty()) {
+    // This could happen if agent is hard rebooted after the checkpoint file is
+    // created but before the data is synced on disk.
+    LOG(WARNING) << "The CSI volumes checkpointed at '" << volumesPath
+                 << "' for container " << containerId << " is empty";
+
+    // Construct an info object with empty CSI volumes since no CSI volumes
+    // are mounted yet for this container, and this container will be cleaned
+    // up by containerizer (as known orphan container) or by `recover` (as
+    // unknown orphan container).
+    infos.put(containerId, Owned<Info>(new Info(hashset<CSIVolume>())));
+
+    return Nothing();
+  }
+
+  Try<JSON::Object> json = JSON::parse<JSON::Object>(read.get());
+  if (json.isError()) {
+    return Error("JSON parse failed: " + json.error());
+  }
+
+  Try<CSIVolumes> parse = ::protobuf::parse<CSIVolumes>(json.get());
+  if (parse.isError()) {
+    return Error("Protobuf parse failed: " + parse.error());
+  }
+
+  hashset<CSIVolume> volumes;
+  foreach (const CSIVolume& volume, parse->volumes()) {
+    VLOG(1) << "Recovering CSI volume with plugin '" << volume.plugin_name()
+            << "' and ID '" << volume.id() << "' for container " << containerId;
+
+    if (volumes.contains(volume)) {
+      return Error(
+          "Duplicate CSI volume with plugin '" + volume.plugin_name() +
+          "' and ID '" + volume.id() + "'");
+    }
+
+    volumes.insert(volume);
+  }
+
+  infos.put(containerId, Owned<Info>(new Info(volumes)));
+
   return Nothing();
 }
 
diff --git a/src/slave/containerizer/mesos/isolators/volume/csi/isolator.hpp b/src/slave/containerizer/mesos/isolators/volume/csi/isolator.hpp
index e05a7b8..373b629 100644
--- a/src/slave/containerizer/mesos/isolators/volume/csi/isolator.hpp
+++ b/src/slave/containerizer/mesos/isolators/volume/csi/isolator.hpp
@@ -98,6 +98,8 @@ private:
       const ContainerID& containerId,
       const std::vector<process::Future<Nothing>>& futures);
 
+  Try<Nothing> recoverContainer(const ContainerID& containerId);
+
   const Flags flags;
   CSIServer* csiServer;