You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by an...@apache.org on 2017/09/29 20:31:36 UTC

[03/11] mesos git commit: Added the mark agent gone handler on the master.

Added the mark agent gone handler on the master.

This change adds the neccessary logic for handling the mark agent
gone call on the master. Once an agent is marked as gone, it's
not allowed to re-register with the Mesos master. GC'ing the
list of gone agents (it can grow unbounded) would be added in
a separate change.

Review: https://reviews.apache.org/r/62478/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/57873eba
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/57873eba
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/57873eba

Branch: refs/heads/master
Commit: 57873ebacdad052024586cb28994e920aa7e6123
Parents: ff07dd8
Author: Anand Mazumdar <an...@apache.org>
Authored: Fri Sep 29 11:57:12 2017 -0700
Committer: Anand Mazumdar <an...@apache.org>
Committed: Fri Sep 29 13:31:08 2017 -0700

----------------------------------------------------------------------
 src/master/http.cpp       | 101 +++++++++++++++++++++++++++++++++++
 src/master/master.cpp     | 116 ++++++++++++++++++++++++++++++++++++++---
 src/master/master.hpp     |  17 ++++++
 src/master/validation.cpp |   6 +++
 4 files changed, 234 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/57873eba/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 28d0393..66a391f 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -779,6 +779,9 @@ Future<Response> Master::Http::api(
 
     case mesos::master::Call::TEARDOWN:
       return teardown(call, principal, acceptType);
+
+    case mesos::master::Call::MARK_AGENT_GONE:
+      return markAgentGone(call, principal, acceptType);
   }
 
   UNREACHABLE();
@@ -5286,6 +5289,104 @@ Future<Response> Master::Http::unreserveResources(
   return _unreserve(slaveId, resources, principal);
 }
 
+
+Future<Response> Master::Http::markAgentGone(
+    const mesos::master::Call& call,
+    const Option<Principal>& principal,
+    ContentType contentType) const
+{
+  CHECK_EQ(mesos::master::Call::MARK_AGENT_GONE, call.type());
+
+  const SlaveID& slaveId = call.mark_agent_gone().slave_id();
+
+  LOG(INFO) << "Marking agent '" << slaveId << "' as gone";
+
+  if (master->slaves.gone.contains(slaveId)) {
+    LOG(WARNING) << "Not marking agent '" << slaveId
+                 << "' as gone because it has already transitioned to gone";
+    return OK();
+  }
+
+  // We return a `ServiceUnavailable` (retryable error) if there is
+  // an ongoing registry transition to gone/removed/unreachable.
+  if (master->slaves.markingGone.contains(slaveId)) {
+    LOG(WARNING) << "Not marking agent '" << slaveId
+                 << "' as gone because another gone transition"
+                 << " is already in progress";
+
+    return ServiceUnavailable(
+        "Agent '" + stringify(slaveId) + "' is already being transitioned"
+        + " to gone");
+  }
+
+  if (master->slaves.removing.contains(slaveId)) {
+    LOG(WARNING) << "Not marking agent '" << slaveId
+                 << "' as gone because another remove transition"
+                 << " is already in progress";
+
+    return ServiceUnavailable(
+        "Agent '" + stringify(slaveId) + "' is being transitioned to removed");
+  }
+
+  if (master->slaves.markingUnreachable.contains(slaveId)) {
+    LOG(WARNING) << "Not marking agent '" << slaveId
+                 << "' as gone because another unreachable transition"
+                 << " is already in progress";
+
+    return ServiceUnavailable(
+        "Agent '" + stringify(slaveId) + "' is being transitioned to"
+        + " unreachable");
+  }
+
+  // We currently support marking an agent gone if the agent
+  // is present in the list of active, unreachable or recovered agents.
+  bool found = false;
+
+  if (master->slaves.registered.contains(slaveId)) {
+    found = true;
+  } else if(master->slaves.recovered.contains(slaveId)) {
+    found = true;
+  } else if (master->slaves.unreachable.contains(slaveId)) {
+    found = true;
+  }
+
+  if (!found) {
+    return NotFound("Agent '" + stringify(slaveId) + "' not found");
+  }
+
+  master->slaves.markingGone.insert(slaveId);
+
+  TimeInfo goneTime = protobuf::getCurrentTime();
+
+  Future<bool> gone = master->registrar->apply(Owned<Operation>(
+      new MarkSlaveGone(slaveId, goneTime)));
+
+  gone.onAny(defer(
+      master->self(), [this, slaveId, goneTime](Future<bool> registrarResult) {
+    CHECK(!registrarResult.isDiscarded());
+
+    if (registrarResult.isFailed()) {
+      LOG(FATAL) << "Failed to mark agent " << slaveId
+                 << " as gone in the registry: "
+                 << registrarResult.failure();
+    }
+
+    Slave* slave = master->slaves.registered.get(slaveId);
+
+    // This can happen if the agent that is being marked as
+    // gone is not currently registered (unreachable/recovered).
+    if (slave == nullptr) {
+      return;
+    }
+
+    master->markGone(slave, goneTime);
+  }));
+
+  return gone.then([]() -> Future<Response> {
+    return OK();
+  });
+}
+
 } // namespace master {
 } // namespace internal {
 } // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/57873eba/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index ee71c1d..0e0fb1d 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1723,6 +1723,11 @@ Future<Nothing> Master::_recover(const Registry& registry)
     slaves.unreachable[unreachable.id()] = unreachable.timestamp();
   }
 
+  foreach (const Registry::GoneSlave& gone,
+           registry.gone().slaves()) {
+    slaves.gone[gone.id()] = gone.timestamp();
+  }
+
   // Set up a timer for age-based registry GC.
   scheduleRegistryGc();
 
@@ -2013,6 +2018,26 @@ Nothing Master::markUnreachableAfterFailover(const SlaveInfo& slave)
     return Nothing();
   }
 
+  if (slaves.markingGone.contains(slave.id())) {
+    LOG(INFO) << "Canceling transition of agent "
+              << slave.id() << " (" << slave.hostname() << ")"
+              << " to unreachable because an agent gone"
+              << " operation is in progress";
+
+    ++metrics->slave_unreachable_canceled;
+    return Nothing();
+  }
+
+  if (slaves.gone.contains(slave.id())) {
+    LOG(INFO) << "Canceling transition of agent "
+              << slave.id() << " (" << slave.hostname() << ")"
+              << " to unreachable because the agent has"
+              << " been marked gone";
+
+    ++metrics->slave_unreachable_canceled;
+    return Nothing();
+  }
+
   LOG(WARNING) << "Agent " << slave.id()
                << " (" << slave.hostname() << ") did not re-register"
                << " within " << flags.agent_reregister_timeout
@@ -6056,6 +6081,24 @@ void Master::reregisterSlave(
     return;
   }
 
+  if (slaves.markingGone.contains(slaveInfo.id())) {
+    LOG(INFO)
+      << "Ignoring re-register agent message from agent "
+      << slaveInfo.id() << " at " << from << " ("
+      << slaveInfo.hostname() << ") as a gone operation is already in progress";
+    return;
+  }
+
+  if (slaves.gone.contains(slaveInfo.id())) {
+    LOG(WARNING) << "Refusing re-registration of agent at " << from
+                 << " because it is already marked gone";
+
+    ShutdownMessage message;
+    message.set_message("Agent has been marked gone");
+    send(from, message);
+    return;
+  }
+
   Option<Error> error = validation::master::message::reregisterSlave(
       slaveInfo, tasks, checkpointedResources, executorInfos, frameworks);
 
@@ -7122,6 +7165,20 @@ void Master::markUnreachable(const SlaveID& slaveId, const string& message)
     return;
   }
 
+  if (slaves.markingGone.contains(slaveId)) {
+    LOG(INFO) << "Canceling transition of agent " << slaveId
+              << " to unreachable because an agent gone"
+              << " operation is in progress";
+    return;
+  }
+
+  if (slaves.gone.contains(slaveId)) {
+    LOG(INFO) << "Canceling transition of agent " << slaveId
+              << " to unreachable because the agent has"
+              << " been marked gone";
+    return;
+  }
+
   LOG(INFO) << "Marking agent " << *slave
             << " unreachable: " << message;
 
@@ -7181,6 +7238,23 @@ void Master::_markUnreachable(
 }
 
 
+void Master::markGone(Slave* slave, const TimeInfo& goneTime)
+{
+  CHECK_NOTNULL(slave);
+  CHECK(slaves.markingGone.contains(slave->info.id()));
+  slaves.markingGone.erase(slave->info.id());
+
+  slaves.gone[slave->id] = goneTime;
+
+  // Shutdown the agent if it transitioned to gone.
+  ShutdownMessage message;
+  message.set_message("Agent has been marked gone");
+  send(slave->pid, message);
+
+  __removeSlave(slave, "Agent has been marked gone", None());
+}
+
+
 void Master::reconcile(
     Framework* framework,
     const scheduler::Call::Reconcile& reconcile)
@@ -7312,9 +7386,10 @@ void Master::_reconcileTasks(
   //   (3) Task is unknown, slave is registered: TASK_UNKNOWN.
   //   (4) Task is unknown, slave is transitioning: no-op.
   //   (5) Task is unknown, slave is unreachable: TASK_UNREACHABLE.
-  //   (6) Task is unknown, slave is unknown: TASK_UNKNOWN.
+  //   (6) Task is unknown, slave is gone: TASK_GONE_BY_OPERATOR.
+  //   (7) Task is unknown, slave is unknown: TASK_UNKNOWN.
   //
-  // For cases (3), (5), and (6), TASK_LOST is sent instead if the
+  // For cases (3), (5), (6) and (7) TASK_LOST is sent instead if the
   // framework has not opted-in to the PARTITION_AWARE capability.
   foreach (const TaskStatus& status, statuses) {
     Option<SlaveID> slaveId = None();
@@ -7411,8 +7486,26 @@ void Master::_reconcileTasks(
           None(),
           None(),
           unreachableTime);
+    } else if (slaveId.isSome() && slaves.gone.contains(slaveId.get())) {
+      // (6) Slave is gone: TASK_GONE_BY_OPERATOR. If the framework
+      // does not have the PARTITION_AWARE capability, send TASK_LOST
+      // for backward compatibility.
+      TaskState taskState = TASK_GONE_BY_OPERATOR;
+      if (!framework->capabilities.partitionAware) {
+        taskState = TASK_LOST;
+      }
+
+      update = protobuf::createStatusUpdate(
+          framework->id(),
+          slaveId.get(),
+          status.task_id(),
+          taskState,
+          TaskStatus::SOURCE_MASTER,
+          None(),
+          "Reconciliation: Task is gone",
+          TaskStatus::REASON_RECONCILIATION);
     } else {
-      // (6) Task is unknown, slave is unknown: TASK_UNKNOWN. If the
+      // (7) Task is unknown, slave is unknown: TASK_UNKNOWN. If the
       // framework does not have the PARTITION_AWARE capability, send
       // TASK_LOST for backward compatibility.
       TaskState taskState = TASK_UNKNOWN;
@@ -8665,6 +8758,12 @@ void Master::removeSlave(
     return;
   }
 
+  if (slaves.markingGone.contains(slave->id)) {
+    LOG(WARNING) << "Ignoring removal of agent " << *slave
+                 << " that is in the process of being marked gone";
+    return;
+  }
+
   // This should not be possible, but we protect against it anyway for
   // the sake of paranoia.
   if (slaves.removing.contains(slave->id)) {
@@ -8837,8 +8936,8 @@ void Master::__removeSlave(
   // the slave is already removed.
   allocator->removeSlave(slave->id);
 
-  // Transition tasks to TASK_UNREACHABLE/TASK_LOST
-  // and remove them. We only use TASK_UNREACHABLE if
+  // Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST
+  // and remove them. We only use TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR if
   // the framework has opted in to the PARTITION_AWARE capability.
   foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
     Framework* framework = getFramework(frameworkId);
@@ -8850,7 +8949,12 @@ void Master::__removeSlave(
     if (!framework->capabilities.partitionAware) {
       newTaskState = TASK_LOST;
     } else {
-      newTaskState = TASK_UNREACHABLE;
+      if (unreachableTime.isSome()) {
+        newTaskState = TASK_UNREACHABLE;
+      } else {
+        newTaskState = TASK_GONE_BY_OPERATOR;
+        newTaskReason = TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR;
+      }
     }
 
     foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/57873eba/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 8cbf940..a17378e 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -491,6 +491,8 @@ public:
       const SlaveID& slaveId,
       const std::string& message);
 
+  void markGone(Slave* slave, const TimeInfo& goneTime);
+
   void authenticate(
       const process::UPID& from,
       const process::UPID& pid);
@@ -1665,6 +1667,11 @@ private:
         const Option<process::http::authentication::Principal>& principal,
         ContentType contentType) const;
 
+    process::Future<process::http::Response> markAgentGone(
+        const mesos::master::Call& call,
+        const Option<process::http::authentication::Principal>& principal,
+        ContentType contentType) const;
+
     Master* master;
 
     // NOTE: The quota specific pieces of the Operator API are factored
@@ -1829,6 +1836,9 @@ private:
     // Slaves that are in the process of being marked unreachable.
     hashset<SlaveID> markingUnreachable;
 
+    // Slaves that are in the process of being marked gone.
+    hashset<SlaveID> markingGone;
+
     // This collection includes agents that have gracefully shutdown,
     // as well as those that have been marked unreachable. We keep a
     // cache here to prevent this from growing in an unbounded manner.
@@ -1849,6 +1859,13 @@ private:
     // `registry_max_agent_age`, and `registry_max_agent_count` flags.
     LinkedHashMap<SlaveID, TimeInfo> unreachable;
 
+    // Slaves that have been marked gone. We recover this from the
+    // registry, so it includes slaves marked as gone by other instances
+    // of the master. Note that we use a LinkedHashMap to ensure the order
+    // of elements here matches the order in the registry's gone list, which
+    // matches the order in which agents are marked gone.
+    LinkedHashMap<SlaveID, TimeInfo> gone;
+
     // This rate limiter is used to limit the removal of slaves failing
     // health checks.
     // NOTE: Using a 'shared_ptr' here is OK because 'RateLimiter' is

http://git-wip-us.apache.org/repos/asf/mesos/blob/57873eba/src/master/validation.cpp
----------------------------------------------------------------------
diff --git a/src/master/validation.cpp b/src/master/validation.cpp
index a6e6a90..01bc2e0 100644
--- a/src/master/validation.cpp
+++ b/src/master/validation.cpp
@@ -228,6 +228,12 @@ Option<Error> validate(
         return Error("Expecting 'teardown' to be present");
       }
       return None();
+
+    case mesos::master::Call::MARK_AGENT_GONE:
+      if (!call.has_mark_agent_gone()) {
+        return Error("Expecting 'mark_agent_gone' to be present");
+      }
+      return None();
   }
 
   UNREACHABLE();