You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ya...@apache.org on 2017/07/13 00:46:49 UTC
[1/2] mesos git commit: Recover as a new agent in case of agent info
mismatch on reboot.
Repository: mesos
Updated Branches:
refs/heads/master c35293ef8 -> cd6495e67
Recover as a new agent in case of agent info mismatch on reboot.
This is for backwards compatibility. Prior to Mesos 1.4 we directly
bypassed the state recovery and started as a new agent upon reboot
(introduced in MESOS-844). This unnecessarily discards the existing
agent ID (MESOS-6223). Starting in Mesos 1.4 we'll attempt to recover
the slave state even after reboot but in case of slave info mismatch
we'll fall back to recovering as a new agent (existing behavior). This
prevents the agent from flapping if the agent info (resources,
attributes, etc.) change is due to host maintenance associated with
the reboot.
Review: https://reviews.apache.org/r/60105/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/cd6495e6
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/cd6495e6
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/cd6495e6
Branch: refs/heads/master
Commit: cd6495e677ec74fd3f40b0dbf3b9654475308575
Parents: 91f4e9a
Author: Megha Sharma <ms...@apple.com>
Authored: Mon Jul 10 09:38:28 2017 -0700
Committer: Jiang Yan Xu <xu...@apple.com>
Committed: Wed Jul 12 17:43:25 2017 -0700
----------------------------------------------------------------------
src/slave/slave.cpp | 58 +++++++++++++++++++++++++++++++++---------------
1 file changed, 40 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/cd6495e6/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index beb0c79..64d2411 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -6018,37 +6018,59 @@ Future<Nothing> Slave::recover(const Try<state::State>& state)
}
if (slaveState.isSome() && slaveState->info.isSome()) {
+ if (slaveState->errors > 0) {
+ LOG(WARNING) << "Errors encountered during agent recovery: "
+ << slaveState->errors;
+
+ metrics.recovery_errors += slaveState->errors;
+ }
+
// Check for SlaveInfo compatibility.
// TODO(vinod): Also check for version compatibility.
- // NOTE: We set the 'id' field in 'info' from the recovered slave,
- // as a hack to compare the info created from options/flags with
- // the recovered info.
- info.mutable_id()->CopyFrom(slaveState->id);
+
+ SlaveInfo _info(info);
+ _info.mutable_id()->CopyFrom(slaveState->id);
if (flags.recover == "reconnect" &&
- !(info == slaveState->info.get())) {
- return Failure(strings::join(
+ !(_info == slaveState->info.get())) {
+ string message = strings::join(
"\n",
"Incompatible agent info detected.",
"------------------------------------------------------------",
"Old agent info:\n" + stringify(slaveState->info.get()),
"------------------------------------------------------------",
"New agent info:\n" + stringify(info),
- "------------------------------------------------------------"));
- }
+ "------------------------------------------------------------");
- info = slaveState->info.get(); // Recover the slave info.
+ // Fail the recovery unless the agent is recovering for the first
+ // time after host reboot.
+ //
+ // Prior to Mesos 1.4 we directly bypassed the state recovery and
+ // started as a new agent upon reboot (introduced in MESOS-844).
+ // This unnecessarily discards the existing agent ID (MESOS-6223).
+ // Starting in Mesos 1.4 we'll attempt to recover the slave state
+ // even after reboot but in case of slave info mismatch we'll fall
+ // back to recovering as a new agent (existing behavior). This
+ // prevents the agent from flapping if the slave info (resources,
+ // attributes, etc.) change is due to host maintenance associated
+ // with the reboot.
+ if (!state->rebooted) {
+ return Failure(message);
+ }
- if (slaveState->errors > 0) {
- LOG(WARNING) << "Errors encountered during agent recovery: "
- << slaveState->errors;
+ LOG(WARNING) << "Falling back to recover as a new agent due to error: "
+ << message;
- metrics.recovery_errors += slaveState->errors;
- }
+ // Cleaning up the slave state to avoid any state recovery for the
+ // old agent.
+ slaveState = None();
+ } else {
+ info = slaveState->info.get(); // Recover the slave info.
- // Recover the frameworks.
- foreachvalue (const FrameworkState& frameworkState,
- slaveState->frameworks) {
- recoverFramework(frameworkState, injectedExecutors, injectedTasks);
+ // Recover the frameworks.
+ foreachvalue (const FrameworkState& frameworkState,
+ slaveState->frameworks) {
+ recoverFramework(frameworkState, injectedExecutors, injectedTasks);
+ }
}
}
[2/2] mesos git commit: Stopped short-circuiting agent recovery upon
reboot.
Posted by ya...@apache.org.
Stopped short-circuiting agent recovery upon reboot.
The agent now continues the recovery after a reboot, and a `rebooted`
flag was added to `slave::State` to record that the host has rebooted.
Review: https://reviews.apache.org/r/60104/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/91f4e9ac
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/91f4e9ac
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/91f4e9ac
Branch: refs/heads/master
Commit: 91f4e9acd0bad60201155b68a896d12d7200eda3
Parents: c35293e
Author: Megha Sharma <ms...@apple.com>
Authored: Mon Jul 10 09:34:40 2017 -0700
Committer: Jiang Yan Xu <xu...@apple.com>
Committed: Wed Jul 12 17:43:25 2017 -0700
----------------------------------------------------------------------
src/slave/state.cpp | 3 +--
src/slave/state.hpp | 1 +
2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/91f4e9ac/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index 5dd8b1c..24efd4b 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -82,7 +82,6 @@ Try<State> recover(const string& rootDir, bool strict)
// resources checkpoint file.
state.resources = resources.get();
- // If the machine has rebooted, skip recovering slave state.
const string& bootIdPath = paths::getBootIdPath(rootDir);
if (os::exists(bootIdPath)) {
Try<string> read = os::read(bootIdPath);
@@ -95,7 +94,7 @@ Try<State> recover(const string& rootDir, bool strict)
if (id.get() != strings::trim(read.get())) {
LOG(INFO) << "Agent host rebooted";
- return state;
+ state.rebooted = true;
}
}
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/91f4e9ac/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 537358c..18c4319 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -316,6 +316,7 @@ struct State
Option<ResourcesState> resources;
Option<SlaveState> slave;
+ bool rebooted = false;
// TODO(jieyu): Consider using a vector of Option<Error> here so
// that we can print all the errors. This also applies to all the