You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ya...@apache.org on 2017/07/13 00:46:49 UTC

[1/2] mesos git commit: Recover as a new agent in case of agent info mismatch on reboot.

Repository: mesos
Updated Branches:
  refs/heads/master c35293ef8 -> cd6495e67


Recover as a new agent in case of agent info mismatch on reboot.

This is for backwards compatibility. Prior to Mesos 1.4 we directly
bypassed the state recovery and started as a new agent upon reboot
(introduced in MESOS-844). This unnecessarily discarded the existing
agent ID (MESOS-6223). Starting in Mesos 1.4 we'll attempt to recover
the slave state even after reboot, but in case of a slave info mismatch
we'll fall back to recovering as a new agent (the existing behavior).
This prevents the agent from flapping if the change in agent info
(resources, attributes, etc.) is due to host maintenance associated
with the reboot.

Review: https://reviews.apache.org/r/60105/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/cd6495e6
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/cd6495e6
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/cd6495e6

Branch: refs/heads/master
Commit: cd6495e677ec74fd3f40b0dbf3b9654475308575
Parents: 91f4e9a
Author: Megha Sharma <ms...@apple.com>
Authored: Mon Jul 10 09:38:28 2017 -0700
Committer: Jiang Yan Xu <xu...@apple.com>
Committed: Wed Jul 12 17:43:25 2017 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp | 58 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/cd6495e6/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index beb0c79..64d2411 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -6018,37 +6018,59 @@ Future<Nothing> Slave::recover(const Try<state::State>& state)
   }
 
   if (slaveState.isSome() && slaveState->info.isSome()) {
+    if (slaveState->errors > 0) {
+      LOG(WARNING) << "Errors encountered during agent recovery: "
+                   << slaveState->errors;
+
+      metrics.recovery_errors += slaveState->errors;
+    }
+
     // Check for SlaveInfo compatibility.
     // TODO(vinod): Also check for version compatibility.
-    // NOTE: We set the 'id' field in 'info' from the recovered slave,
-    // as a hack to compare the info created from options/flags with
-    // the recovered info.
-    info.mutable_id()->CopyFrom(slaveState->id);
+
+    SlaveInfo _info(info);
+    _info.mutable_id()->CopyFrom(slaveState->id);
     if (flags.recover == "reconnect" &&
-        !(info == slaveState->info.get())) {
-      return Failure(strings::join(
+        !(_info == slaveState->info.get())) {
+      string message = strings::join(
           "\n",
           "Incompatible agent info detected.",
           "------------------------------------------------------------",
           "Old agent info:\n" + stringify(slaveState->info.get()),
           "------------------------------------------------------------",
           "New agent info:\n" + stringify(info),
-          "------------------------------------------------------------"));
-    }
+          "------------------------------------------------------------");
 
-    info = slaveState->info.get(); // Recover the slave info.
+      // Fail the recovery unless the agent is recovering for the first
+      // time after host reboot.
+      //
+      // Prior to Mesos 1.4 we directly bypass the state recovery and
+      // start as a new agent upon reboot (introduced in MESOS-844).
+      // This unnecessarily discards the existing agent ID (MESOS-6223).
+      // Starting in Mesos 1.4 we'll attempt to recover the slave state
+      // even after reboot but in case of slave info mismatch we'll fall
+      // back to recovering as a new agent (existing behavior). This
+      // prevents the agent from flapping if the slave info (resources,
+      // attributes, etc.) change is due to host maintenance associated
+      // with the reboot.
+      if (!state->rebooted) {
+        return Failure(message);
+      }
 
-    if (slaveState->errors > 0) {
-      LOG(WARNING) << "Errors encountered during agent recovery: "
-                   << slaveState->errors;
+      LOG(WARNING) << "Falling back to recover as a new agent due to error: "
+                   << message;
 
-      metrics.recovery_errors += slaveState->errors;
-    }
+      // Cleaning up the slave state to avoid any state recovery for the
+      // old agent.
+      slaveState = None();
+    } else {
+      info = slaveState->info.get(); // Recover the slave info.
 
-    // Recover the frameworks.
-    foreachvalue (const FrameworkState& frameworkState,
-                  slaveState->frameworks) {
-      recoverFramework(frameworkState, injectedExecutors, injectedTasks);
+      // Recover the frameworks.
+      foreachvalue (const FrameworkState& frameworkState,
+                    slaveState->frameworks) {
+        recoverFramework(frameworkState, injectedExecutors, injectedTasks);
+      }
     }
   }
 


[2/2] mesos git commit: Stopped short-circuiting agent recovery upon reboot.

Posted by ya...@apache.org.
Stopped short-circuiting agent recovery upon reboot.

The agent now continues with the recovery after a reboot, and a
`rebooted` flag was added to `slave::State` to record that the host
has rebooted.

Review: https://reviews.apache.org/r/60104/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/91f4e9ac
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/91f4e9ac
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/91f4e9ac

Branch: refs/heads/master
Commit: 91f4e9acd0bad60201155b68a896d12d7200eda3
Parents: c35293e
Author: Megha Sharma <ms...@apple.com>
Authored: Mon Jul 10 09:34:40 2017 -0700
Committer: Jiang Yan Xu <xu...@apple.com>
Committed: Wed Jul 12 17:43:25 2017 -0700

----------------------------------------------------------------------
 src/slave/state.cpp | 3 +--
 src/slave/state.hpp | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/91f4e9ac/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index 5dd8b1c..24efd4b 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -82,7 +82,6 @@ Try<State> recover(const string& rootDir, bool strict)
   // resources checkpoint file.
   state.resources = resources.get();
 
-  // If the machine has rebooted, skip recovering slave state.
   const string& bootIdPath = paths::getBootIdPath(rootDir);
   if (os::exists(bootIdPath)) {
     Try<string> read = os::read(bootIdPath);
@@ -95,7 +94,7 @@ Try<State> recover(const string& rootDir, bool strict)
 
       if (id.get() != strings::trim(read.get())) {
         LOG(INFO) << "Agent host rebooted";
-        return state;
+        state.rebooted = true;
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/91f4e9ac/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 537358c..18c4319 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -316,6 +316,7 @@ struct State
 
   Option<ResourcesState> resources;
   Option<SlaveState> slave;
+  bool rebooted = false;
 
   // TODO(jieyu): Consider using a vector of Option<Error> here so
   // that we can print all the errors. This also applies to all the