You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2015/05/14 01:59:27 UTC

[3/4] mesos git commit: Added reason metrics for slave removals.

Added reason metrics for slave removals.

Review: https://reviews.apache.org/r/33154


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/cc395509
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/cc395509
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/cc395509

Branch: refs/heads/master
Commit: cc395509a80facbdb22102920430b7f0e04bbc03
Parents: 23a51be
Author: Benjamin Mahler <be...@gmail.com>
Authored: Mon Apr 13 17:19:16 2015 -0700
Committer: Benjamin Mahler <be...@gmail.com>
Committed: Wed May 13 16:45:30 2015 -0700

----------------------------------------------------------------------
 include/mesos/mesos.proto |  7 +++++--
 src/master/master.cpp     | 41 ++++++++++++++++++++++++++++++-----------
 src/master/master.hpp     | 14 ++++++++++++--
 src/master/metrics.cpp    | 12 ++++++++++++
 src/master/metrics.hpp    |  3 +++
 5 files changed, 62 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index db4fc8c..15f55a3 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -753,14 +753,17 @@ enum TaskState {
  * Describes the current status of a task.
  */
 message TaskStatus {
-  /** Describes the source of the task status update. */
+  // Describes the source of the task status update.
   enum Source {
     SOURCE_MASTER = 0;
     SOURCE_SLAVE = 1;
     SOURCE_EXECUTOR = 2;
   }
 
-  /** Detailed reason for the task status update. */
+  // Detailed reason for the task status update.
+  //
+  // TODO(bmahler): Differentiate between slave removal reasons
+  // (e.g. unhealthy vs. unregistered for maintenance).
   enum Reason {
     REASON_COMMAND_EXECUTOR_FAILED = 0;
     REASON_EXECUTOR_TERMINATED = 1;

http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index ec32cd6..eaea79d 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -993,7 +993,8 @@ void Master::exited(const UPID& pid)
         // Remove the slave, if it is not checkpointing.
         LOG(INFO) << "Removing disconnected slave " << *slave
                   << " because it is not checkpointing!";
-        removeSlave(slave);
+        removeSlave(slave,
+                    "slave is non-checkpointing and disconnected");
         return;
       } else if (slave->connected) {
         // Checkpointing slaves can just be disconnected.
@@ -1380,7 +1381,9 @@ Nothing Master::removeSlave(const Registry::Slave& slave)
                    &Self::_removeSlave,
                    slave.info(),
                    vector<StatusUpdate>(), // No TASK_LOST updates to send.
-                   lambda::_1));
+                   lambda::_1,
+                   "did not re-register after master failover",
+                   metrics->slave_removals_reason_unhealthy));
   } else {
     // When a non-strict registry is in use, we want to ensure the
     // registry is used in a write-only manner. Therefore we remove
@@ -3101,7 +3104,9 @@ void Master::registerSlave(
         LOG(INFO)
           << "Removing old disconnected slave " << *slave
           << " because a registration attempt is being made from " << from;
-        removeSlave(slave);
+        removeSlave(slave,
+                    "a new slave registered at the same address",
+                    metrics->slave_removals_reason_registered);
         break;
       } else {
         CHECK(slave->active)
@@ -3437,7 +3442,9 @@ void Master::unregisterSlave(const UPID& from, const SlaveID& slaveId)
                    << " because it is not the slave " << slave->pid;
       return;
     }
-    removeSlave(slave);
+    removeSlave(slave,
+                "the slave unregistered",
+                metrics->slave_removals_reason_unregistered);
   }
 }
 
@@ -3646,7 +3653,7 @@ void Master::shutdownSlave(const SlaveID& slaveId, const string& message)
   message_.set_message(message);
   send(slave->pid, message_);
 
-  removeSlave(slave);
+  removeSlave(slave, message, metrics->slave_removals_reason_unhealthy);
 }
 
 
@@ -4664,11 +4671,14 @@ void Master::addSlave(
 }
 
 
-void Master::removeSlave(Slave* slave)
+void Master::removeSlave(
+    Slave* slave,
+    const string& message,
+    Option<Counter> reason)
 {
   CHECK_NOTNULL(slave);
 
-  LOG(INFO) << "Removing slave " << *slave;
+  LOG(INFO) << "Removing slave " << *slave << ": " << message;
 
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
@@ -4692,7 +4702,7 @@ void Master::removeSlave(Slave* slave)
           task->task_id(),
           TASK_LOST,
           TaskStatus::SOURCE_MASTER,
-          "Slave " + slave->info.hostname() + " removed",
+          "Slave " + slave->info.hostname() + " removed: " + message,
           TaskStatus::REASON_SLAVE_REMOVED,
           (task->has_executor_id() ?
               Option<ExecutorID>(task->executor_id()) : None()));
@@ -4743,7 +4753,9 @@ void Master::removeSlave(Slave* slave)
                  &Self::_removeSlave,
                  slave->info,
                  updates,
-                 lambda::_1));
+                 lambda::_1,
+                 message,
+                 reason));
 
   delete slave;
 }
@@ -4752,7 +4764,9 @@ void Master::removeSlave(Slave* slave)
 void Master::_removeSlave(
     const SlaveInfo& slaveInfo,
     const vector<StatusUpdate>& updates,
-    const Future<bool>& removed)
+    const Future<bool>& removed,
+    const string& message,
+    Option<Counter> reason)
 {
   slaves.removing.erase(slaveInfo.id());
 
@@ -4769,9 +4783,14 @@ void Master::_removeSlave(
     << "already removed from the registrar";
 
   LOG(INFO) << "Removed slave " << slaveInfo.id() << " ("
-            << slaveInfo.hostname() << ")";
+            << slaveInfo.hostname() << "): " << message;
+
   ++metrics->slave_removals;
 
+  if (reason.isSome()) {
+    ++utils::copy(reason.get()); // Remove const.
+  }
+
   // Forward the LOST updates on to the framework.
   foreach (const StatusUpdate& update, updates) {
     Framework* framework = getFramework(update.framework_id());

http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index af41216..da0a835 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -44,6 +44,8 @@
 #include <process/protobuf.hpp>
 #include <process/timer.hpp>
 
+#include <process/metrics/counter.hpp>
+
 #include <stout/cache.hpp>
 #include <stout/foreach.hpp>
 #include <stout/hashmap.hpp>
@@ -363,11 +365,19 @@ protected:
   Nothing removeSlave(const Registry::Slave& slave);
 
   // Remove the slave from the registrar and from the master's state.
-  void removeSlave(Slave* slave);
+  //
+  // TODO(bmahler): 'reason' is optional until MESOS-2317 is resolved.
+  void removeSlave(
+      Slave* slave,
+      const std::string& message,
+      Option<process::metrics::Counter> reason = None());
+
   void _removeSlave(
       const SlaveInfo& slaveInfo,
       const std::vector<StatusUpdate>& updates,
-      const process::Future<bool>& removed);
+      const process::Future<bool>& removed,
+      const std::string& message,
+      Option<process::metrics::Counter> reason = None());
 
   // Authorizes the task.
   // Returns true if task is authorized.

http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 973f051..ee09664 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -148,6 +148,12 @@ Metrics::Metrics(const Master& master)
         "master/slave_reregistrations"),
     slave_removals(
         "master/slave_removals"),
+    slave_removals_reason_unhealthy(
+        "master/slave_removals/reason_unhealthy"),
+    slave_removals_reason_unregistered(
+        "master/slave_removals/reason_unregistered"),
+    slave_removals_reason_registered(
+        "master/slave_removals/reason_registered"),
     slave_shutdowns_scheduled(
         "master/slave_shutdowns_scheduled"),
     slave_shutdowns_completed(
@@ -224,6 +230,9 @@ Metrics::Metrics(const Master& master)
   process::metrics::add(slave_registrations);
   process::metrics::add(slave_reregistrations);
   process::metrics::add(slave_removals);
+  process::metrics::add(slave_removals_reason_unhealthy);
+  process::metrics::add(slave_removals_reason_unregistered);
+  process::metrics::add(slave_removals_reason_registered);
 
   process::metrics::add(slave_shutdowns_scheduled);
   process::metrics::add(slave_shutdowns_completed);
@@ -327,6 +336,9 @@ Metrics::~Metrics()
   process::metrics::remove(slave_registrations);
   process::metrics::remove(slave_reregistrations);
   process::metrics::remove(slave_removals);
+  process::metrics::remove(slave_removals_reason_unhealthy);
+  process::metrics::remove(slave_removals_reason_unregistered);
+  process::metrics::remove(slave_removals_reason_registered);
 
   process::metrics::remove(slave_shutdowns_scheduled);
   process::metrics::remove(slave_shutdowns_completed);

http://git-wip-us.apache.org/repos/asf/mesos/blob/cc395509/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index ee3982e..78d0666 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -163,6 +163,9 @@ struct Metrics
   process::metrics::Counter slave_registrations;
   process::metrics::Counter slave_reregistrations;
   process::metrics::Counter slave_removals;
+  process::metrics::Counter slave_removals_reason_unhealthy;
+  process::metrics::Counter slave_removals_reason_unregistered;
+  process::metrics::Counter slave_removals_reason_registered;
 
   // Slave observer metrics.
   process::metrics::Counter slave_shutdowns_scheduled;