You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/04/25 23:32:18 UTC

[25/50] mesos git commit: Re-checkpoint frameworks after agent recovery.

Re-checkpoint frameworks after agent recovery.

When performing an upgrade cycle, it is possible for a 0.24 or
later agent to recover from a framework checkpoint written by 0.22
or earlier. In this case, we need to compatibly accept a missing
FrameworkID, and then rewrite the framework checkpoint so that
subsequent upgrades don't hit the same problem.

Review: https://reviews.apache.org/r/40177


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/040b0677
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/040b0677
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/040b0677

Branch: refs/heads/0.26.x
Commit: 040b0677d30d6c115a9f8d1182cbbd376ee6f89b
Parents: 8a5f9a9
Author: James Peach <jp...@apache.org>
Authored: Mon Nov 23 15:31:05 2015 -0800
Committer: Michael Park <mp...@apache.org>
Committed: Fri Feb 26 20:59:06 2016 -0800

----------------------------------------------------------------------
 src/slave/slave.cpp | 55 ++++++++++++++++++++++++++++++++----------------
 src/slave/slave.hpp |  1 +
 2 files changed, 38 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/040b0677/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 05b4b4f..cf38393 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1343,6 +1343,9 @@ void Slave::runTask(
 
     framework = new Framework(this, frameworkInfo, frameworkPid);
     frameworks[frameworkId] = framework;
+    if (frameworkInfo.checkpoint()) {
+      framework->checkpointFramework();
+    }
 
     // Is this same framework in completedFrameworks? If so, move the completed
     // executors to this framework and remove it from that list.
@@ -4333,7 +4336,18 @@ void Slave::recoverFramework(const FrameworkState& state)
 
   CHECK_SOME(state.info);
   FrameworkInfo frameworkInfo = state.info.get();
+
+  // Mesos 0.22 and earlier didn't write the FrameworkID into the FrameworkInfo.
+  // In this case, we update FrameworkInfo.framework_id from directory name,
+  // and rewrite the new format when we are done.
+  bool recheckpoint = false;
+  if (!frameworkInfo.has_id()) {
+    frameworkInfo.mutable_id()->CopyFrom(state.id);
+    recheckpoint = true;
+  }
+
   CHECK(frameworkInfo.has_id());
+  CHECK(frameworkInfo.checkpoint());
 
   // In 0.24.0, HTTP schedulers are supported and these do not
   // have a 'pid'. In this case, the slave will checkpoint UPID().
@@ -4348,6 +4362,10 @@ void Slave::recoverFramework(const FrameworkState& state)
   Framework* framework = new Framework(this, frameworkInfo, pid);
   frameworks[framework->id()] = framework;
 
+  if (recheckpoint) {
+    framework->checkpointFramework();
+  }
+
   // Now recover the executors for this framework.
   foreachvalue (const ExecutorState& executorState, state.executors) {
     framework->recoverExecutor(executorState);
@@ -4910,30 +4928,31 @@ Framework::Framework(
     slave(_slave),
     info(_info),
     pid(_pid),
-    completedExecutors(MAX_COMPLETED_EXECUTORS_PER_FRAMEWORK)
+    completedExecutors(MAX_COMPLETED_EXECUTORS_PER_FRAMEWORK) {}
+
+
+void Framework::checkpointFramework() const
 {
-  if (info.checkpoint() && slave->state != slave->RECOVERING) {
-    // Checkpoint the framework info.
-    string path = paths::getFrameworkInfoPath(
-        slave->metaDir, slave->info.id(), id());
+  // Checkpoint the framework info.
+  string path = paths::getFrameworkInfoPath(
+      slave->metaDir, slave->info.id(), id());
 
-    VLOG(1) << "Checkpointing FrameworkInfo to '" << path << "'";
+  VLOG(1) << "Checkpointing FrameworkInfo to '" << path << "'";
 
-    CHECK_SOME(state::checkpoint(path, info));
+  CHECK_SOME(state::checkpoint(path, info));
 
-    // Checkpoint the framework pid, note that we checkpoint a
-    // UPID() when it is None (for HTTP schedulers) because
-    // 0.23.x slaves consider a missing pid file to be an
-    // error.
-    path = paths::getFrameworkPidPath(
-        slave->metaDir, slave->info.id(), id());
+  // Checkpoint the framework pid, note that we checkpoint a
+  // UPID() when it is None (for HTTP schedulers) because
+  // 0.23.x slaves consider a missing pid file to be an
+  // error.
+  path = paths::getFrameworkPidPath(
+      slave->metaDir, slave->info.id(), id());
 
-    VLOG(1) << "Checkpointing framework pid"
-            << " '" << pid.getOrElse(UPID()) << "'"
-            << " to '" << path << "'";
+  VLOG(1) << "Checkpointing framework pid"
+          << " '" << pid.getOrElse(UPID()) << "'"
+          << " to '" << path << "'";
 
-    CHECK_SOME(state::checkpoint(path, pid.getOrElse(UPID())));
-  }
+  CHECK_SOME(state::checkpoint(path, pid.getOrElse(UPID())));
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/040b0677/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index ec2dfa9..0637e7a 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -745,6 +745,7 @@ struct Framework
   Executor* getExecutor(const ExecutorID& executorId);
   Executor* getExecutor(const TaskID& taskId);
   void recoverExecutor(const state::ExecutorState& state);
+  void checkpointFramework() const;
 
   const FrameworkID id() const { return info.id(); }