You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/03/13 07:24:06 UTC

svn commit: r1455818 - in /incubator/mesos/trunk: src/common/type_utils.hpp src/slave/paths.hpp src/slave/slave.cpp src/slave/slave.hpp third_party/libprocess/third_party/stout/include/stout/os.hpp

Author: vinodkone
Date: Wed Mar 13 06:24:06 2013
New Revision: 1455818

URL: http://svn.apache.org/r1455818
Log:
Added support for incompatible upgrade.

Review: https://reviews.apache.org/r/8763

Modified:
    incubator/mesos/trunk/src/common/type_utils.hpp
    incubator/mesos/trunk/src/slave/paths.hpp
    incubator/mesos/trunk/src/slave/slave.cpp
    incubator/mesos/trunk/src/slave/slave.hpp
    incubator/mesos/trunk/third_party/libprocess/third_party/stout/include/stout/os.hpp

Modified: incubator/mesos/trunk/src/common/type_utils.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/common/type_utils.hpp?rev=1455818&r1=1455817&r2=1455818&view=diff
==============================================================================
--- incubator/mesos/trunk/src/common/type_utils.hpp (original)
+++ incubator/mesos/trunk/src/common/type_utils.hpp Wed Mar 13 06:24:06 2013
@@ -25,6 +25,7 @@
 
 #include <boost/functional/hash.hpp>
 
+#include "common/attributes.hpp"
 #include "common/resources.hpp"
 
 #include "messages/messages.hpp"
@@ -80,6 +81,12 @@ inline std::ostream& operator << (std::o
 }
 
 
+inline std::ostream& operator << (std::ostream& stream, const SlaveInfo& slave)
+{
+  return stream << slave.DebugString(); // Writes slave.DebugString() as is.
+}
+
+
 inline bool operator == (const FrameworkID& left, const FrameworkID& right)
 {
   return left.value() == right.value();
@@ -255,6 +262,21 @@ inline bool operator == (const ExecutorI
 }
 
 
+inline bool operator == (const SlaveInfo& left, const SlaveInfo& right)
+{
+  return left.has_id() == right.has_id() && // Presence must agree,
+    left.has_webui_port() == right.has_webui_port() && // then the values.
+    (left.has_id() ? left.id() == right.id() : true) &&
+    (left.has_webui_port() ? left.webui_port() == right.webui_port() : true) &&
+    left.hostname() == right.hostname() &&
+    left.webui_hostname() == right.webui_hostname() &&
+    internal::Attributes(left.attributes()) ==
+    internal::Attributes(right.attributes()) &&
+    internal::Resources(left.resources()) ==
+    internal::Resources(right.resources());
+}
+
+
 inline std::size_t hash_value(const FrameworkID& frameworkId)
 {
   size_t seed = 0;

Modified: incubator/mesos/trunk/src/slave/paths.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/slave/paths.hpp?rev=1455818&r1=1455817&r2=1455818&view=diff
==============================================================================
--- incubator/mesos/trunk/src/slave/paths.hpp (original)
+++ incubator/mesos/trunk/src/slave/paths.hpp Wed Mar 13 06:24:06 2013
@@ -81,6 +81,12 @@ inline std::string getMetaRootDir(const 
 }
 
 
+inline std::string getArchiveDir(const std::string& rootDir)
+{
+  return path::join(rootDir, "archive"); // I.e., 'archive' under 'rootDir'.
+}
+
+
 inline std::string getLatestSlavePath(const std::string& rootDir)
 {
   return strings::format(LATEST_SLAVE_PATH, rootDir).get();

Modified: incubator/mesos/trunk/src/slave/slave.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/slave/slave.cpp?rev=1455818&r1=1455817&r2=1455818&view=diff
==============================================================================
--- incubator/mesos/trunk/src/slave/slave.cpp (original)
+++ incubator/mesos/trunk/src/slave/slave.cpp Wed Mar 13 06:24:06 2013
@@ -996,14 +996,6 @@ void Slave::_statusUpdateAcknowledgement
         << future.get().error();
     return;
   }
-
-  // If this slave is in 'recover=cleanup' mode, exit after all executors
-  // have exited.
-  if (flags.recover == "cleanup" && frameworks.empty()) {
-    LOG(INFO) << "Slave is shutting down because it was started in cleanup "
-              << " recovery mode and all updates have been acknowledged!";
-    shutdown();
-  }
 }
 
 
@@ -1550,6 +1542,51 @@ void Slave::executorTerminated(
     // Pass ownership of the framework pointer.
     completedFrameworks.push_back(std::tr1::shared_ptr<Framework>(framework));
   }
+
+  // If this slave is in 'recover=cleanup' mode, exit after all executors
+  // have exited.
+  // TODO(vinod): Ensure all status updates have been acknowledged.
+  if (flags.recover == "cleanup" && frameworks.size() == 0) {
+    cleanup();
+  }
+}
+
+
+// Archives and then deletes the meta directory, and shuts down the slave.
+void Slave::cleanup()
+{
+  CHECK(flags.recover == "cleanup");
+  LOG(INFO) << "Slave is shutting down because it is started with "
+            << "--recover=cleanup and all executors have terminated!";
+
+  string archiveDir = paths::getArchiveDir(flags.work_dir);
+  string metaDir = paths::getMetaRootDir(flags.work_dir);
+
+  // Archive and delete the meta directory, to allow incompatible upgrades.
+  LOG(INFO) << "Archiving and deleting the meta directory '" << metaDir
+            << "' to allow incompatible upgrade!";
+
+  // Create the archive directory, if it doesn't exist.
+  Try<Nothing> result = os::mkdir(archiveDir);
+  if (result.isSome()) {
+    result = os::tar(
+        metaDir, path::join(archiveDir, info.id().value() + ".tar.gz"));
+    if (result.isError()) {
+      LOG(ERROR) << "Failed to archive meta directory '" << metaDir
+                 << "': " << result.error();
+    }
+  } else {
+    LOG(ERROR) << "Failed to create archive directory '" << archiveDir
+               << "': " << result.error();
+  }
+
+  result = os::rmdir(metaDir);
+  if (result.isError()) {
+    LOG(ERROR) << "Failed to delete meta directory '" << metaDir
+               << "': " << result.error();
+  }
+
+  shutdown();
+}
 
 
@@ -1661,17 +1698,15 @@ Future<Nothing> Slave::recover(bool reco
 {
   const string& metaDir = paths::getMetaRootDir(flags.work_dir);
 
-  // We consider the absence of 'metaDir' to mean that this is the
-  // very first time this slave was started with checkpointing
-  // enabled.
+  // We consider the absence of 'metaDir' to mean that this is either
+  // the first time this slave was started with checkpointing enabled
+  // or this slave was started after an upgrade (--recover=cleanup).
   if (!os::exists(metaDir)) {
     // NOTE: We recover the isolation module here to cleanup any old
     // executors (e.g: orphaned cgroups).
     return dispatch(isolationModule, &IsolationModule::recover, None());
   }
 
-  // TODO(vinod): Check for version and slaveinfo compatibility.
-
   // First, recover the slave state.
   Result<SlaveState> state = state::recover(metaDir, safe);
   if (state.isError()) {
@@ -1685,6 +1720,23 @@ Future<Nothing> Slave::recover(bool reco
     return dispatch(isolationModule, &IsolationModule::recover, None());
   }
 
+  // Check for SlaveInfo compatibility.
+  // TODO(vinod): Also check for version compatibility.
+  // NOTE: We set the 'id' field in 'info' from the recovered state,
+  // as a hack to compare the info created from options/flags with
+  // the recovered info.
+  info.mutable_id()->CopyFrom(state.get().id);
+  if (reconnect && !(info == state.get().info.get())) {
+    EXIT(1)
+      << "Incompatible slave info detected.\n"
+      << "Old slave info:\n" << state.get().info.get() << "\n"
+      << "New slave info:\n" << info << "\n"
+      << "To properly upgrade the slave do as follows:\n"
+      << "Step 1: Start the slave (old slave info) with --recover=cleanup.\n"
+      << "Step 2: Wait till the slave kills all executors and shuts down.\n"
+      << "Step 3: Start the upgraded slave (new slave info).\n";
+  }
+
   info = state.get().info.get(); // Recover the slave info.
 
   // Recover the status update manager, then the isolation module and

Modified: incubator/mesos/trunk/src/slave/slave.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/slave/slave.hpp?rev=1455818&r1=1455817&r2=1455818&view=diff
==============================================================================
--- incubator/mesos/trunk/src/slave/slave.hpp (original)
+++ incubator/mesos/trunk/src/slave/slave.hpp Wed Mar 13 06:24:06 2013
@@ -224,6 +224,10 @@ protected:
       const state::SlaveState& state,
       bool reconnect);
 
+  // Called when the slave is started in 'cleanup' recovery mode and
+  // all the executors have terminated.
+  void cleanup();
+
 private:
   Slave(const Slave&);              // No copying.
   Slave& operator = (const Slave&); // No assigning.

Modified: incubator/mesos/trunk/third_party/libprocess/third_party/stout/include/stout/os.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/third_party/libprocess/third_party/stout/include/stout/os.hpp?rev=1455818&r1=1455817&r2=1455818&view=diff
==============================================================================
--- incubator/mesos/trunk/third_party/libprocess/third_party/stout/include/stout/os.hpp (original)
+++ incubator/mesos/trunk/third_party/libprocess/third_party/stout/include/stout/os.hpp Wed Mar 13 06:24:06 2013
@@ -845,6 +845,23 @@ inline Try<int> shell(std::ostream* os, 
 }
 
 
+// Creates a tar 'archive' with gzip compression, of the given 'path'.
+// NOTE: Both arguments are single-quoted so that spaces in them survive.
+inline Try<Nothing> tar(const std::string& path, const std::string& archive)
+{
+  Try<int> status =
+    shell(NULL, "tar -czf '%s' '%s'", archive.c_str(), path.c_str());
+  if (status.isError()) {
+    return Error("Failed to archive " + path + ": " + status.error());
+  } else if (status.get() != 0) {
+    return Error("Non-zero exit status when archiving " + path +
+                 ": " + stringify(status.get()));
+  }
+
+  return Nothing();
+}
+
+
 // Returns the list of files that match the given (shell) pattern.
 inline Try<std::list<std::string> > glob(const std::string& pattern)
 {