You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/04/25 23:32:18 UTC
[25/50] mesos git commit: Re-checkpoint frameworks after agent
recovery.
Re-checkpoint frameworks after agent recovery.
When performing an upgrade cycle, it is possible for a 0.24 and
later agent to recover from a framework checkpoint written by 0.22
or earlier. In this case, we need to compatibly accept a missing
FrameworkID, and then rewrite the framework checkpoint so that
subsequent upgrades don't hit the same problem.
Review: https://reviews.apache.org/r/40177
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/040b0677
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/040b0677
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/040b0677
Branch: refs/heads/0.26.x
Commit: 040b0677d30d6c115a9f8d1182cbbd376ee6f89b
Parents: 8a5f9a9
Author: James Peach <jp...@apache.org>
Authored: Mon Nov 23 15:31:05 2015 -0800
Committer: Michael Park <mp...@apache.org>
Committed: Fri Feb 26 20:59:06 2016 -0800
----------------------------------------------------------------------
src/slave/slave.cpp | 55 ++++++++++++++++++++++++++++++++----------------
src/slave/slave.hpp | 1 +
2 files changed, 38 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/040b0677/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 05b4b4f..cf38393 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1343,6 +1343,9 @@ void Slave::runTask(
framework = new Framework(this, frameworkInfo, frameworkPid);
frameworks[frameworkId] = framework;
+ if (frameworkInfo.checkpoint()) {
+ framework->checkpointFramework();
+ }
// Is this same framework in completedFrameworks? If so, move the completed
// executors to this framework and remove it from that list.
@@ -4333,7 +4336,18 @@ void Slave::recoverFramework(const FrameworkState& state)
CHECK_SOME(state.info);
FrameworkInfo frameworkInfo = state.info.get();
+
+ // Mesos 0.22 and earlier didn't write the FrameworkID into the FrameworkInfo.
+ // In this case, we update FrameworkInfo.framework_id from directory name,
+ // and rewrite the new format when we are done.
+ bool recheckpoint = false;
+ if (!frameworkInfo.has_id()) {
+ frameworkInfo.mutable_id()->CopyFrom(state.id);
+ recheckpoint = true;
+ }
+
CHECK(frameworkInfo.has_id());
+ CHECK(frameworkInfo.checkpoint());
// In 0.24.0, HTTP schedulers are supported and these do not
// have a 'pid'. In this case, the slave will checkpoint UPID().
@@ -4348,6 +4362,10 @@ void Slave::recoverFramework(const FrameworkState& state)
Framework* framework = new Framework(this, frameworkInfo, pid);
frameworks[framework->id()] = framework;
+ if (recheckpoint) {
+ framework->checkpointFramework();
+ }
+
// Now recover the executors for this framework.
foreachvalue (const ExecutorState& executorState, state.executors) {
framework->recoverExecutor(executorState);
@@ -4910,30 +4928,31 @@ Framework::Framework(
slave(_slave),
info(_info),
pid(_pid),
- completedExecutors(MAX_COMPLETED_EXECUTORS_PER_FRAMEWORK)
+ completedExecutors(MAX_COMPLETED_EXECUTORS_PER_FRAMEWORK) {}
+
+
+void Framework::checkpointFramework() const
{
- if (info.checkpoint() && slave->state != slave->RECOVERING) {
- // Checkpoint the framework info.
- string path = paths::getFrameworkInfoPath(
- slave->metaDir, slave->info.id(), id());
+ // Checkpoint the framework info.
+ string path = paths::getFrameworkInfoPath(
+ slave->metaDir, slave->info.id(), id());
- VLOG(1) << "Checkpointing FrameworkInfo to '" << path << "'";
+ VLOG(1) << "Checkpointing FrameworkInfo to '" << path << "'";
- CHECK_SOME(state::checkpoint(path, info));
+ CHECK_SOME(state::checkpoint(path, info));
- // Checkpoint the framework pid, note that we checkpoint a
- // UPID() when it is None (for HTTP schedulers) because
- // 0.23.x slaves consider a missing pid file to be an
- // error.
- path = paths::getFrameworkPidPath(
- slave->metaDir, slave->info.id(), id());
+ // Checkpoint the framework pid, note that we checkpoint a
+ // UPID() when it is None (for HTTP schedulers) because
+ // 0.23.x slaves consider a missing pid file to be an
+ // error.
+ path = paths::getFrameworkPidPath(
+ slave->metaDir, slave->info.id(), id());
- VLOG(1) << "Checkpointing framework pid"
- << " '" << pid.getOrElse(UPID()) << "'"
- << " to '" << path << "'";
+ VLOG(1) << "Checkpointing framework pid"
+ << " '" << pid.getOrElse(UPID()) << "'"
+ << " to '" << path << "'";
- CHECK_SOME(state::checkpoint(path, pid.getOrElse(UPID())));
- }
+ CHECK_SOME(state::checkpoint(path, pid.getOrElse(UPID())));
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/040b0677/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index ec2dfa9..0637e7a 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -745,6 +745,7 @@ struct Framework
Executor* getExecutor(const ExecutorID& executorId);
Executor* getExecutor(const TaskID& taskId);
void recoverExecutor(const state::ExecutorState& state);
+ void checkpointFramework() const;
const FrameworkID id() const { return info.id(); }